And the new files required for it to actually work! Hurrr.
This commit is contained in:
@@ -45,6 +45,83 @@ const U32 LLREFCOUNT_SENTINEL_VALUE = 0xAAAAAAAA;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if LL_DEBUG
|
||||
// Allocates 'size' bytes at an address that is a multiple of 'align'.
// 'align' must be a positive power of two; values smaller than a pointer are raised to
// sizeof(void*) so the bookkeeping slot below the returned address is itself aligned.
// The pointer obtained from malloc() is stored in the pointer-sized slot immediately
// below the returned address, which is where ll_aligned_free() looks for it.
// Returns NULL if 'align' is invalid, the padded size would overflow, or malloc() fails.
// The returned hunk MUST be released with ll_aligned_free().
inline void* ll_aligned_malloc( size_t size, int align )
{
	if ( align <= 0 || (align & (align - 1)) != 0 )
	{
		return NULL;
	}

	size_t alignment = (size_t)align;
	if ( alignment < sizeof(void*) )
	{
		alignment = sizeof(void*);
	}

	// Worst case padding needed to reach the next boundary, plus the bookkeeping slot.
	const size_t overhead = (alignment - 1) + sizeof(void*);
	if ( size > (size_t)-1 - overhead )
	{
		return NULL;
	}

	void* mem = malloc( size + overhead );
	if ( !mem )
	{
		return NULL;
	}

	// Round up to the boundary. An address that is already aligned stays where it is, so at
	// most (alignment - 1) bytes of padding are consumed and 'size' bytes always remain.
	uintptr_t addr = (uintptr_t)mem + sizeof(void*);
	addr = (addr + (alignment - 1)) & ~(uintptr_t)(alignment - 1);

	void** aligned = (void**)addr;
	aligned[-1] = mem;
	return aligned;
}
|
||||
|
||||
// Releases a hunk obtained from ll_aligned_malloc().
// The pointer originally returned by malloc() is read back from the pointer-sized slot
// immediately below 'ptr' and handed to free(). Passing NULL is a no-op, as with free().
inline void ll_aligned_free( void* ptr )
{
	if ( !ptr )
	{
		return;
	}

	void** slot = (void**)ptr;
	free( slot[-1] );
}
|
||||
|
||||
inline void* ll_aligned_malloc_16(size_t size) // returned hunk MUST be freed with ll_aligned_free_16().
|
||||
{
|
||||
#if defined(LL_WINDOWS)
|
||||
return _mm_malloc(size, 16);
|
||||
#elif defined(LL_DARWIN)
|
||||
return malloc(size); // default osx malloc is 16 byte aligned.
|
||||
#else
|
||||
void *rtn;
|
||||
if (LL_LIKELY(0 == posix_memalign(&rtn, 16, size)))
|
||||
return rtn;
|
||||
else // bad alignment requested, or out of memory
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Releases a hunk obtained from ll_aligned_malloc_16().
inline void ll_aligned_free_16(void *p)
{
#if defined(LL_WINDOWS)
	_mm_free(p);
#else
	// Darwin hunks come straight from malloc(); elsewhere they come from posix_memalign(),
	// whose memory is likewise handed back through the regular heap deallocator.
	free(p);
#endif
}
|
||||
|
||||
inline void* ll_aligned_malloc_32(size_t size) // returned hunk MUST be freed with ll_aligned_free_32().
|
||||
{
|
||||
#if defined(LL_WINDOWS)
|
||||
return _mm_malloc(size, 32);
|
||||
#elif defined(LL_DARWIN)
|
||||
return ll_aligned_malloc( size, 32 );
|
||||
#else
|
||||
void *rtn;
|
||||
if (LL_LIKELY(0 == posix_memalign(&rtn, 32, size)))
|
||||
return rtn;
|
||||
else // bad alignment requested, or out of memory
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Releases a hunk obtained from ll_aligned_malloc_32(), using the deallocator that
// pairs with whichever allocator that function picked for the platform.
inline void ll_aligned_free_32(void *p)
{
#if defined(LL_WINDOWS)
	// Pairs with _mm_malloc().
	_mm_free(p);
#elif defined(LL_DARWIN)
	// Pairs with ll_aligned_malloc().
	ll_aligned_free( p );
#else
	// Memory from posix_memalign() goes back through the regular heap deallocator.
	free(p);
#endif
}
|
||||
#else // LL_DEBUG
|
||||
// ll_aligned_foo are noops now that we use tcmalloc everywhere (tcmalloc aligns automatically at appropriate intervals)
// In this (non-LL_DEBUG) branch every ll_aligned_* name is a plain alias for malloc()/free():
// the 'align' argument of ll_aligned_malloc() is discarded and no explicit alignment is done.
// NOTE(review): correctness therefore depends on the allocator actually linked in providing
// the required alignment - confirm for any build that does not use tcmalloc.
|
||||
#define ll_aligned_malloc( size, align ) malloc(size)
|
||||
#define ll_aligned_free( ptr ) free(ptr)
|
||||
#define ll_aligned_malloc_16 malloc
|
||||
#define ll_aligned_free_16 free
|
||||
#define ll_aligned_malloc_32 malloc
|
||||
#define ll_aligned_free_32 free
|
||||
#endif // LL_DEBUG
|
||||
|
||||
class LL_COMMON_API LLMemory
|
||||
{
|
||||
public:
|
||||
|
||||
134
indra/llmath/llmatrix3a.cpp
Normal file
134
indra/llmath/llmatrix3a.cpp
Normal file
@@ -0,0 +1,134 @@
|
||||
/**
|
||||
* @file llmatrix3a.cpp
|
||||
* @brief SIMD 3x3 matrix implementation
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#include "llmath.h"
|
||||
|
||||
static LL_ALIGN_16(const F32 M_IDENT_3A[12]) =
|
||||
{ 1.f, 0.f, 0.f, 0.f, // Column 1
|
||||
0.f, 1.f, 0.f, 0.f, // Column 2
|
||||
0.f, 0.f, 1.f, 0.f }; // Column 3
|
||||
|
||||
extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A);
|
||||
|
||||
void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs )
|
||||
{
|
||||
const LLVector4a col0 = lhs.getColumn(0);
|
||||
const LLVector4a col1 = lhs.getColumn(1);
|
||||
const LLVector4a col2 = lhs.getColumn(2);
|
||||
|
||||
for ( int i = 0; i < 3; i++ )
|
||||
{
|
||||
LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() );
|
||||
xxxx.splat<0>( xxxx );
|
||||
xxxx.mul( col0 );
|
||||
|
||||
{
|
||||
LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 1 );
|
||||
yyyy.splat<0>( yyyy );
|
||||
yyyy.mul( col1 );
|
||||
xxxx.add( yyyy );
|
||||
}
|
||||
|
||||
{
|
||||
LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 2 );
|
||||
zzzz.splat<0>( zzzz );
|
||||
zzzz.mul( col2 );
|
||||
xxxx.add( zzzz );
|
||||
}
|
||||
|
||||
xxxx.store4a( mColumns[i].getF32ptr() );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst )
|
||||
{
|
||||
const LLVector4a col0 = xform.getColumn(0);
|
||||
const LLVector4a col1 = xform.getColumn(1);
|
||||
const LLVector4a col2 = xform.getColumn(2);
|
||||
const LLVector4a* maxAddr = src + numVectors;
|
||||
|
||||
if ( numVectors & 0x1 )
|
||||
{
|
||||
LLVector4a xxxx = _mm_load_ss( (const F32*)src );
|
||||
LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
|
||||
LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 );
|
||||
xxxx.splat<0>( xxxx );
|
||||
yyyy.splat<0>( yyyy );
|
||||
zzzz.splat<0>( zzzz );
|
||||
xxxx.mul( col0 );
|
||||
yyyy.mul( col1 );
|
||||
zzzz.mul( col2 );
|
||||
xxxx.add( yyyy );
|
||||
xxxx.add( zzzz );
|
||||
xxxx.store4a( (F32*)dst );
|
||||
src++;
|
||||
dst++;
|
||||
}
|
||||
|
||||
|
||||
numVectors >>= 1;
|
||||
while ( src < maxAddr )
|
||||
{
|
||||
_mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA );
|
||||
_mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA );
|
||||
LLVector4a xxxx = _mm_load_ss( (const F32*)src );
|
||||
LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) );
|
||||
|
||||
xxxx.splat<0>( xxxx );
|
||||
xxxx1.splat<0>( xxxx1 );
|
||||
xxxx.mul( col0 );
|
||||
xxxx1.mul( col0 );
|
||||
|
||||
{
|
||||
LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
|
||||
LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1);
|
||||
yyyy.splat<0>( yyyy );
|
||||
yyyy1.splat<0>( yyyy1 );
|
||||
yyyy.mul( col1 );
|
||||
yyyy1.mul( col1 );
|
||||
xxxx.add( yyyy );
|
||||
xxxx1.add( yyyy1 );
|
||||
}
|
||||
|
||||
{
|
||||
LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 );
|
||||
LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 );
|
||||
zzzz.splat<0>( zzzz );
|
||||
zzzz1.splat<0>( zzzz1 );
|
||||
zzzz.mul( col2 );
|
||||
zzzz1.mul( col2 );
|
||||
xxxx.add( zzzz );
|
||||
xxxx1.add( zzzz1 );
|
||||
}
|
||||
|
||||
xxxx.store4a(dst->getF32ptr());
|
||||
src++;
|
||||
dst++;
|
||||
|
||||
xxxx1.store4a((F32*)dst++);
|
||||
}
|
||||
}
|
||||
128
indra/llmath/llmatrix3a.h
Normal file
128
indra/llmath/llmatrix3a.h
Normal file
@@ -0,0 +1,128 @@
|
||||
/**
|
||||
* @file llmatrix3a.h
|
||||
* @brief LLMatrix3a class header file - memory aligned and vectorized 3x3 matrix
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_LLMATRIX3A_H
|
||||
#define LL_LLMATRIX3A_H
|
||||
|
||||
/////////////////////////////
|
||||
// LLMatrix3a, LLRotation
|
||||
/////////////////////////////
|
||||
// This class stores a 3x3 (technically 4x3) matrix in column-major order
|
||||
/////////////////////////////
|
||||
/////////////////////////////
|
||||
// These classes are intentionally minimal right now. If you need additional
|
||||
// functionality, please contact someone with SSE experience (e.g., Falcon or
|
||||
// Huseby).
|
||||
/////////////////////////////
|
||||
|
||||
// LLMatrix3a is the base class for LLRotation, which should be used instead any time you're dealing with a
|
||||
// rotation matrix.
|
||||
class LLMatrix3a
|
||||
{
|
||||
public:
|
||||
|
||||
// Utility function for quickly transforming an array of LLVector4a's
|
||||
// For transforming a single LLVector4a, see LLVector4a::setRotated
|
||||
static void batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst );
|
||||
|
||||
// Utility function to obtain the identity matrix
|
||||
static inline const LLMatrix3a& getIdentity();
|
||||
|
||||
//////////////////////////
|
||||
// Ctors
|
||||
//////////////////////////
|
||||
|
||||
// Ctor
|
||||
LLMatrix3a() {}
|
||||
|
||||
// Ctor for setting by columns
|
||||
inline LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 );
|
||||
|
||||
//////////////////////////
|
||||
// Get/Set
|
||||
//////////////////////////
|
||||
|
||||
// Loads from an LLMatrix3
|
||||
inline void loadu(const LLMatrix3& src);
|
||||
|
||||
// Set rows
|
||||
inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2);
|
||||
|
||||
// Set columns
|
||||
inline void setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2);
|
||||
|
||||
// Get the read-only access to a specified column. Valid columns are 0-2, but the
|
||||
// function is unchecked. You've been warned.
|
||||
inline const LLVector4a& getColumn(const U32 column) const;
|
||||
|
||||
/////////////////////////
|
||||
// Matrix modification
|
||||
/////////////////////////
|
||||
|
||||
// Set this matrix to the product of lhs and rhs ( this = lhs * rhs )
|
||||
void setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs );
|
||||
|
||||
// Set this matrix to the transpose of src
|
||||
inline void setTranspose(const LLMatrix3a& src);
|
||||
|
||||
// Set this matrix to a*w + b*(1-w)
|
||||
inline void setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w);
|
||||
|
||||
/////////////////////////
|
||||
// Matrix inspection
|
||||
/////////////////////////
|
||||
|
||||
// Sets all 4 elements in 'dest' to the determinant of this matrix.
|
||||
// If you will be using the determinant in subsequent ops with LLVector4a, use this version
|
||||
inline void getDeterminant( LLVector4a& dest ) const;
|
||||
|
||||
// Returns the determinant as an LLSimdScalar. Use this if you will be using the determinant
|
||||
// primary for scalar operations.
|
||||
inline LLSimdScalar getDeterminant() const;
|
||||
|
||||
// Returns nonzero if rows 0-2 and colums 0-2 contain no NaN or INF values. Row 3 is ignored
|
||||
inline LLBool32 isFinite() const;
|
||||
|
||||
// Returns true if this matrix is equal to 'rhs' up to 'tolerance'
|
||||
inline bool isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
|
||||
|
||||
protected:
|
||||
|
||||
LLVector4a mColumns[3];
|
||||
|
||||
};
|
||||
|
||||
class LLRotation : public LLMatrix3a
|
||||
{
|
||||
public:
|
||||
|
||||
LLRotation() {}
|
||||
|
||||
// Returns true if this rotation is orthonormal with det ~= 1
|
||||
inline bool isOkRotation() const;
|
||||
};
|
||||
|
||||
#endif
|
||||
119
indra/llmath/llmatrix3a.inl
Normal file
119
indra/llmath/llmatrix3a.inl
Normal file
@@ -0,0 +1,119 @@
|
||||
/**
|
||||
* @file llmatrix3a.inl
|
||||
* @brief LLMatrix3a inline definitions
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#include "llmatrix3a.h"
|
||||
#include "m3math.h"
|
||||
|
||||
// Builds the matrix by storing c0, c1 and c2 as columns 0, 1 and 2.
inline LLMatrix3a::LLMatrix3a( const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2 )
{
	mColumns[0] = c0;
	mColumns[1] = c1;
	mColumns[2] = c2;
}
|
||||
|
||||
inline void LLMatrix3a::loadu(const LLMatrix3& src)
|
||||
{
|
||||
mColumns[0].load3(src.mMatrix[0]);
|
||||
mColumns[1].load3(src.mMatrix[1]);
|
||||
mColumns[2].load3(src.mMatrix[2]);
|
||||
}
|
||||
|
||||
inline void LLMatrix3a::setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2)
|
||||
{
|
||||
mColumns[0] = r0;
|
||||
mColumns[1] = r1;
|
||||
mColumns[2] = r2;
|
||||
setTranspose( *this );
|
||||
}
|
||||
|
||||
inline void LLMatrix3a::setColumns(const LLVector4a& c0, const LLVector4a& c1, const LLVector4a& c2)
|
||||
{
|
||||
mColumns[0] = c0;
|
||||
mColumns[1] = c1;
|
||||
mColumns[2] = c2;
|
||||
}
|
||||
|
||||
inline void LLMatrix3a::setTranspose(const LLMatrix3a& src)
|
||||
{
|
||||
const LLQuad srcCol0 = src.mColumns[0];
|
||||
const LLQuad srcCol1 = src.mColumns[1];
|
||||
const LLQuad unpacklo = _mm_unpacklo_ps( srcCol0, srcCol1 );
|
||||
mColumns[0] = _mm_movelh_ps( unpacklo, src.mColumns[2] );
|
||||
mColumns[1] = _mm_shuffle_ps( _mm_movehl_ps( srcCol0, unpacklo ), src.mColumns[2], _MM_SHUFFLE(0, 1, 1, 0) );
|
||||
mColumns[2] = _mm_shuffle_ps( _mm_unpackhi_ps( srcCol0, srcCol1 ), src.mColumns[2], _MM_SHUFFLE(0, 2, 1, 0) );
|
||||
}
|
||||
|
||||
// Read-only access to column 'column'. The index is only checked by llassert.
inline const LLVector4a& LLMatrix3a::getColumn(const U32 column) const
{
	llassert( column < 3 );
	const LLVector4a& selected = mColumns[column];
	return selected;
}
|
||||
|
||||
inline void LLMatrix3a::setLerp(const LLMatrix3a& a, const LLMatrix3a& b, F32 w)
|
||||
{
|
||||
mColumns[0].setLerp( a.mColumns[0], b.mColumns[0], w );
|
||||
mColumns[1].setLerp( a.mColumns[1], b.mColumns[1], w );
|
||||
mColumns[2].setLerp( a.mColumns[2], b.mColumns[2], w );
|
||||
}
|
||||
|
||||
// Reports whether isFinite3() holds for every one of the three columns; stops at the
// first column that fails.
inline LLBool32 LLMatrix3a::isFinite() const
{
	for ( int i = 0; i < 3; ++i )
	{
		if ( !mColumns[i].isFinite3() )
		{
			return false;
		}
	}
	return true;
}
|
||||
|
||||
// Computes the determinant as (column1 x column2) . column0 and stores it in 'dest'
// through setAllDot3.
inline void LLMatrix3a::getDeterminant( LLVector4a& dest ) const
{
	LLVector4a cross12;
	cross12.setCross3( mColumns[1], mColumns[2] );

	dest.setAllDot3( cross12, mColumns[0] );
}
|
||||
|
||||
// Returns the determinant, computed as (column1 x column2) . column0.
inline LLSimdScalar LLMatrix3a::getDeterminant() const
{
	LLVector4a cross12;
	cross12.setCross3( mColumns[1], mColumns[2] );

	return cross12.dot3( mColumns[0] );
}
|
||||
|
||||
inline bool LLMatrix3a::isApproximatelyEqual( const LLMatrix3a& rhs, F32 tolerance /*= F_APPROXIMATELY_ZERO*/ ) const
|
||||
{
|
||||
return rhs.getColumn(0).equals3(mColumns[0], tolerance)
|
||||
&& rhs.getColumn(1).equals3(mColumns[1], tolerance)
|
||||
&& rhs.getColumn(2).equals3(mColumns[2], tolerance);
|
||||
}
|
||||
|
||||
// Returns a reference to the identity matrix object LL_M3A_IDENTITY, which is declared
// here and defined in another translation unit.
inline const LLMatrix3a& LLMatrix3a::getIdentity()
{
	extern const LLMatrix3a LL_M3A_IDENTITY;
	const LLMatrix3a& identity = LL_M3A_IDENTITY;
	return identity;
}
|
||||
|
||||
inline bool LLRotation::isOkRotation() const
|
||||
{
|
||||
LLMatrix3a transpose; transpose.setTranspose( *this );
|
||||
LLMatrix3a product; product.setMul( *this, transpose );
|
||||
|
||||
LLSimdScalar detMinusOne = getDeterminant() - 1.f;
|
||||
|
||||
return product.isApproximatelyEqual( LLMatrix3a::getIdentity() ) && (detMinusOne.getAbs() < F_APPROXIMATELY_ZERO);
|
||||
}
|
||||
|
||||
143
indra/llmath/llmatrix4a.h
Normal file
143
indra/llmath/llmatrix4a.h
Normal file
@@ -0,0 +1,143 @@
|
||||
/**
|
||||
* @file llmatrix4a.h
|
||||
* @brief LLMatrix4a class header file - memory aligned and vectorized 4x4 matrix
|
||||
*
|
||||
* $LicenseInfo:firstyear=2007&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_LLMATRIX4A_H
|
||||
#define LL_LLMATRIX4A_H
|
||||
|
||||
#include "llvector4a.h"
|
||||
#include "m4math.h"
|
||||
#include "m3math.h"
|
||||
|
||||
class LLMatrix4a
|
||||
{
|
||||
public:
|
||||
LLVector4a mMatrix[4];
|
||||
|
||||
inline void clear()
|
||||
{
|
||||
mMatrix[0].clear();
|
||||
mMatrix[1].clear();
|
||||
mMatrix[2].clear();
|
||||
mMatrix[3].clear();
|
||||
}
|
||||
|
||||
inline void loadu(const LLMatrix4& src)
|
||||
{
|
||||
mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
|
||||
mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]);
|
||||
mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]);
|
||||
mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]);
|
||||
|
||||
}
|
||||
|
||||
inline void loadu(const LLMatrix3& src)
|
||||
{
|
||||
mMatrix[0].load3(src.mMatrix[0]);
|
||||
mMatrix[1].load3(src.mMatrix[1]);
|
||||
mMatrix[2].load3(src.mMatrix[2]);
|
||||
mMatrix[3].set(0,0,0,1.f);
|
||||
}
|
||||
|
||||
inline void add(const LLMatrix4a& rhs)
|
||||
{
|
||||
mMatrix[0].add(rhs.mMatrix[0]);
|
||||
mMatrix[1].add(rhs.mMatrix[1]);
|
||||
mMatrix[2].add(rhs.mMatrix[2]);
|
||||
mMatrix[3].add(rhs.mMatrix[3]);
|
||||
}
|
||||
|
||||
inline void setRows(const LLVector4a& r0, const LLVector4a& r1, const LLVector4a& r2)
|
||||
{
|
||||
mMatrix[0] = r0;
|
||||
mMatrix[1] = r1;
|
||||
mMatrix[2] = r2;
|
||||
}
|
||||
|
||||
inline void setMul(const LLMatrix4a& m, const F32 s)
|
||||
{
|
||||
mMatrix[0].setMul(m.mMatrix[0], s);
|
||||
mMatrix[1].setMul(m.mMatrix[1], s);
|
||||
mMatrix[2].setMul(m.mMatrix[2], s);
|
||||
mMatrix[3].setMul(m.mMatrix[3], s);
|
||||
}
|
||||
|
||||
inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w)
|
||||
{
|
||||
LLVector4a d0,d1,d2,d3;
|
||||
d0.setSub(b.mMatrix[0], a.mMatrix[0]);
|
||||
d1.setSub(b.mMatrix[1], a.mMatrix[1]);
|
||||
d2.setSub(b.mMatrix[2], a.mMatrix[2]);
|
||||
d3.setSub(b.mMatrix[3], a.mMatrix[3]);
|
||||
|
||||
// this = a + d*w
|
||||
|
||||
d0.mul(w);
|
||||
d1.mul(w);
|
||||
d2.mul(w);
|
||||
d3.mul(w);
|
||||
|
||||
mMatrix[0].setAdd(a.mMatrix[0],d0);
|
||||
mMatrix[1].setAdd(a.mMatrix[1],d1);
|
||||
mMatrix[2].setAdd(a.mMatrix[2],d2);
|
||||
mMatrix[3].setAdd(a.mMatrix[3],d3);
|
||||
}
|
||||
|
||||
inline void rotate(const LLVector4a& v, LLVector4a& res)
|
||||
{
|
||||
res = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
res.mul(mMatrix[0]);
|
||||
|
||||
LLVector4a y;
|
||||
y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
y.mul(mMatrix[1]);
|
||||
|
||||
LLVector4a z;
|
||||
z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
z.mul(mMatrix[2]);
|
||||
|
||||
res.add(y);
|
||||
res.add(z);
|
||||
}
|
||||
|
||||
inline void affineTransform(const LLVector4a& v, LLVector4a& res)
|
||||
{
|
||||
LLVector4a x,y,z;
|
||||
|
||||
x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
x.mul(mMatrix[0]);
|
||||
y.mul(mMatrix[1]);
|
||||
z.mul(mMatrix[2]);
|
||||
|
||||
x.add(y);
|
||||
z.add(mMatrix[3]);
|
||||
res.setAdd(x,z);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
105
indra/llmath/llquaternion2.h
Normal file
105
indra/llmath/llquaternion2.h
Normal file
@@ -0,0 +1,105 @@
|
||||
/**
|
||||
* @file llquaternion2.h
|
||||
* @brief LLQuaternion2 class header file - SIMD-enabled quaternion class
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_QUATERNION2_H
|
||||
#define LL_QUATERNION2_H
|
||||
|
||||
/////////////////////////////
|
||||
// LLQuaternion2
|
||||
/////////////////////////////
|
||||
// This class stores a quaternion x*i + y*j + z*k + w in <x, y, z, w> order
|
||||
// (i.e., w in high order element of vector)
|
||||
/////////////////////////////
|
||||
/////////////////////////////
|
||||
// These classes are intentionally minimal right now. If you need additional
|
||||
// functionality, please contact someone with SSE experience (e.g., Falcon or
|
||||
// Huseby).
|
||||
/////////////////////////////
|
||||
#include "llquaternion.h"
|
||||
|
||||
class LLQuaternion2
|
||||
{
|
||||
public:
|
||||
|
||||
//////////////////////////
|
||||
// Ctors
|
||||
//////////////////////////
|
||||
|
||||
// Ctor
|
||||
LLQuaternion2() {}
|
||||
|
||||
// Ctor from LLQuaternion
|
||||
explicit LLQuaternion2( const class LLQuaternion& quat );
|
||||
|
||||
//////////////////////////
|
||||
// Get/Set
|
||||
//////////////////////////
|
||||
|
||||
// Load from an LLQuaternion
|
||||
inline void operator=( const LLQuaternion& quat )
|
||||
{
|
||||
mQ.loadua( quat.mQ );
|
||||
}
|
||||
|
||||
// Return the internal LLVector4a representation of the quaternion
|
||||
inline const LLVector4a& getVector4a() const;
|
||||
inline LLVector4a& getVector4aRw();
|
||||
|
||||
/////////////////////////
|
||||
// Quaternion modification
|
||||
/////////////////////////
|
||||
|
||||
// Set this quaternion to the conjugate of src
|
||||
inline void setConjugate(const LLQuaternion2& src);
|
||||
|
||||
// Renormalizes the quaternion. Assumes it has nonzero length.
|
||||
inline void normalize();
|
||||
|
||||
// Quantize this quaternion to 8 bit precision
|
||||
inline void quantize8();
|
||||
|
||||
// Quantize this quaternion to 16 bit precision
|
||||
inline void quantize16();
|
||||
|
||||
/////////////////////////
|
||||
// Quaternion inspection
|
||||
/////////////////////////
|
||||
|
||||
// Return true if this quaternion is equal to 'rhs'.
|
||||
// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid
|
||||
// quaternion representations and they will NOT compare equal.
|
||||
inline bool equals(const LLQuaternion2& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
|
||||
|
||||
// Return true if all components are finite and the quaternion is normalized
|
||||
inline bool isOkRotation() const;
|
||||
|
||||
protected:
|
||||
|
||||
LLVector4a mQ;
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
102
indra/llmath/llquaternion2.inl
Normal file
102
indra/llmath/llquaternion2.inl
Normal file
@@ -0,0 +1,102 @@
|
||||
/**
|
||||
* @file llquaternion2.inl
|
||||
* @brief LLQuaternion2 inline definitions
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#include "llquaternion2.h"
|
||||
|
||||
// Constants with +1 in every lane and -1 in every lane. They are passed as the two
// bounds arguments to LLVector4a::quantize8()/quantize16() by the quantize functions below.
static const LLQuad LL_V4A_PLUS_ONE = {1.f, 1.f, 1.f, 1.f};
|
||||
static const LLQuad LL_V4A_MINUS_ONE = {-1.f, -1.f, -1.f, -1.f};
|
||||
|
||||
// Ctor from LLQuaternion
|
||||
inline LLQuaternion2::LLQuaternion2( const LLQuaternion& quat )
|
||||
{
|
||||
mQ.set(quat.mQ[VX], quat.mQ[VY], quat.mQ[VZ], quat.mQ[VW]);
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
// Get/Set
|
||||
//////////////////////////
|
||||
|
||||
// Return the internal LLVector4a representation of the quaternion
|
||||
inline const LLVector4a& LLQuaternion2::getVector4a() const
|
||||
{
|
||||
return mQ;
|
||||
}
|
||||
|
||||
// Writable view of the LLVector4a that stores the quaternion.
inline LLVector4a& LLQuaternion2::getVector4aRw()
{
	LLVector4a& storage = mQ;
	return storage;
}
|
||||
|
||||
/////////////////////////
|
||||
// Quaternion modification
|
||||
/////////////////////////
|
||||
|
||||
// Set this quaternion to the conjugate of src
|
||||
inline void LLQuaternion2::setConjugate(const LLQuaternion2& src)
|
||||
{
|
||||
static LL_ALIGN_16( const U32 F_QUAT_INV_MASK_4A[4] ) = { 0x80000000, 0x80000000, 0x80000000, 0x00000000 };
|
||||
mQ = _mm_xor_ps(src.mQ, *reinterpret_cast<const LLQuad*>(&F_QUAT_INV_MASK_4A));
|
||||
}
|
||||
|
||||
// Renormalizes the quaternion. Assumes it has nonzero length.
|
||||
inline void LLQuaternion2::normalize()
|
||||
{
|
||||
mQ.normalize4();
|
||||
}
|
||||
|
||||
// Quantize this quaternion to 8 bit precision
|
||||
inline void LLQuaternion2::quantize8()
|
||||
{
|
||||
mQ.quantize8( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE );
|
||||
normalize();
|
||||
}
|
||||
|
||||
// Quantize this quaternion to 16 bit precision
|
||||
inline void LLQuaternion2::quantize16()
|
||||
{
|
||||
mQ.quantize16( LL_V4A_MINUS_ONE, LL_V4A_PLUS_ONE );
|
||||
normalize();
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////
|
||||
// Quaternion inspection
|
||||
/////////////////////////
|
||||
|
||||
// Return true if this quaternion is equal to 'rhs'.
|
||||
// Note! Quaternions exhibit "double-cover", so any rotation has two equally valid
|
||||
// quaternion representations and they will NOT compare equal.
|
||||
inline bool LLQuaternion2::equals(const LLQuaternion2 &rhs, F32 tolerance/* = F_APPROXIMATELY_ZERO*/) const
|
||||
{
|
||||
return mQ.equals4(rhs.mQ, tolerance);
|
||||
}
|
||||
|
||||
// Return true if all components are finite and the quaternion is normalized
|
||||
inline bool LLQuaternion2::isOkRotation() const
|
||||
{
|
||||
return mQ.isFinite4() && mQ.isNormalized4();
|
||||
}
|
||||
|
||||
93
indra/llmath/llsimdmath.h
Normal file
93
indra/llmath/llsimdmath.h
Normal file
@@ -0,0 +1,93 @@
|
||||
/**
|
||||
* @file llsimdmath.h
|
||||
* @brief Common header for SIMD-based math library (llvector4a, llmatrix3a, etc.)
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_SIMD_MATH_H
|
||||
#define LL_SIMD_MATH_H
|
||||
|
||||
#ifndef LLMATH_H
|
||||
#error "Please include llmath.h before this file."
|
||||
#endif
|
||||
|
||||
// Refuse to build when SSE2 code generation is not enabled.
// On MSVC, _M_IX86_FP only exists for 32-bit x86 targets; on x64 it is undefined and would
// evaluate to 0 here, wrongly triggering the error even though SSE2 is always available
// on x64. The _M_IX86_FP test is therefore restricted to non-x64 Windows builds.
#if ( ( LL_DARWIN || LL_LINUX ) && !(__SSE2__) ) || ( LL_WINDOWS && !defined(_M_X64) && ( _M_IX86_FP < 2 ) )
#error SSE2 not enabled. LLVector4a and related class will not compile.
#endif
|
||||
|
||||
#if !LL_WINDOWS
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// Rounds 'address' up to the next 16-byte boundary. An address that already sits on a
// 16-byte boundary is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
{
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	const uintptr_t rounded = (raw + 0xF) & ~0xF;
	return reinterpret_cast<T*>(rounded);
}
|
||||
|
||||
// Rounds 'address' up to the next 64-byte boundary. An address that already sits on a
// 64-byte boundary is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
{
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	const uintptr_t rounded = (raw + 0x3F) & ~0x3F;
	return reinterpret_cast<T*>(rounded);
}
|
||||
|
||||
// Portable alignment annotation. Exactly one of LL_ALIGN_PREFIX / LL_ALIGN_POSTFIX expands
// to something on a given platform: Linux/Darwin use a trailing __attribute__((aligned(x))),
// Windows uses a leading __declspec(align(x)). Any other platform is a hard error.
#if LL_LINUX || LL_DARWIN
|
||||
|
||||
#define LL_ALIGN_PREFIX(x)
|
||||
#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
|
||||
|
||||
#elif LL_WINDOWS
|
||||
|
||||
#define LL_ALIGN_PREFIX(x) __declspec(align(x))
|
||||
#define LL_ALIGN_POSTFIX(x)
|
||||
|
||||
#else
|
||||
#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
|
||||
#endif
|
||||
|
||||
// Wraps a declarator so that it is 16-byte aligned, e.g. LL_ALIGN_16(const F32 table[12]).
#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
|
||||
|
||||
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
#include "llsimdtypes.h"
|
||||
#include "llsimdtypes.inl"
|
||||
|
||||
class LLMatrix3a;
|
||||
class LLRotation;
|
||||
class LLMatrix3;
|
||||
|
||||
#include "llquaternion.h"
|
||||
|
||||
#include "llvector4logical.h"
|
||||
#include "llvector4a.h"
|
||||
#include "llmatrix3a.h"
|
||||
#include "llquaternion2.h"
|
||||
#include "llvector4a.inl"
|
||||
#include "llmatrix3a.inl"
|
||||
#include "llquaternion2.inl"
|
||||
|
||||
|
||||
#endif //LL_SIMD_MATH_H
|
||||
124
indra/llmath/llsimdtypes.h
Normal file
124
indra/llmath/llsimdtypes.h
Normal file
@@ -0,0 +1,124 @@
|
||||
/**
|
||||
* @file llsimdtypes.h
|
||||
* @brief Declaration of basic SIMD math related types
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_SIMD_TYPES_H
|
||||
#define LL_SIMD_TYPES_H
|
||||
|
||||
#ifndef LL_SIMD_MATH_H
|
||||
#error "Please include llmath.h before this file."
|
||||
#endif
|
||||
|
||||
typedef __m128 LLQuad;
|
||||
|
||||
|
||||
#if LL_WINDOWS
|
||||
#pragma warning(push)
|
||||
#pragma warning( disable : 4800 3 ) // Disable warning about casting int to bool for this class.
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1500)
|
||||
// VC++ 2005 is missing these intrinsics
|
||||
// __forceinline is MSVC specific and attempts to override compiler inlining judgment. This is so
|
||||
// even in debug builds this call is a NOP.
|
||||
__forceinline const __m128 _mm_castsi128_ps( const __m128i a ) { return reinterpret_cast<const __m128&>(a); }
|
||||
__forceinline const __m128i _mm_castps_si128( const __m128 a ) { return reinterpret_cast<const __m128i&>(a); }
|
||||
#endif // _MSC_VER
|
||||
|
||||
#endif // LL_WINDOWS
|
||||
|
||||
// Boolean stored in a full int. Any non-zero stored value reads back as true,
// which lets integer results be kept and tested without narrowing them first.
class LLBool32
{
public:
    // Deliberately leaves the stored value uninitialized.
    inline LLBool32() {}

    // Store the integer as-is; non-zero means true.
    inline LLBool32(int rhs) : m_bool(rhs) {}
    inline LLBool32(unsigned int rhs) : m_bool(static_cast<int>(rhs)) {}

    // Store 1 for true and 0 for false.
    inline LLBool32(bool rhs) : m_bool(rhs ? 1 : 0) {}
    inline LLBool32& operator= (bool rhs) { m_bool = rhs ? 1 : 0; return *this; }

    // Compare the truth value (not the raw integer) against rhs.
    inline bool operator== (bool rhs) const { return (m_bool != 0) == rhs; }
    inline bool operator!= (bool rhs) const { return !operator==(rhs); }

    // True when the stored integer is non-zero. Written as an explicit test
    // rather than a reference cast, which would bind to a converted temporary.
    inline operator bool() const { return m_bool != 0; }

private:
    int m_bool;
};
|
||||
|
||||
#if LL_WINDOWS
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
class LLSimdScalar
|
||||
{
|
||||
public:
|
||||
inline LLSimdScalar() {}
|
||||
inline LLSimdScalar(LLQuad q)
|
||||
{
|
||||
mQ = q;
|
||||
}
|
||||
|
||||
inline LLSimdScalar(F32 f)
|
||||
{
|
||||
mQ = _mm_set_ss(f);
|
||||
}
|
||||
|
||||
static inline const LLSimdScalar& getZero()
|
||||
{
|
||||
extern const LLQuad F_ZERO_4A;
|
||||
return reinterpret_cast<const LLSimdScalar&>(F_ZERO_4A);
|
||||
}
|
||||
|
||||
inline F32 getF32() const;
|
||||
|
||||
inline LLBool32 isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance = F_APPROXIMATELY_ZERO) const;
|
||||
|
||||
inline LLSimdScalar getAbs() const;
|
||||
|
||||
inline void setMax( const LLSimdScalar& a, const LLSimdScalar& b );
|
||||
|
||||
inline void setMin( const LLSimdScalar& a, const LLSimdScalar& b );
|
||||
|
||||
inline LLSimdScalar& operator=(F32 rhs);
|
||||
|
||||
inline LLSimdScalar& operator+=(const LLSimdScalar& rhs);
|
||||
|
||||
inline LLSimdScalar& operator-=(const LLSimdScalar& rhs);
|
||||
|
||||
inline LLSimdScalar& operator*=(const LLSimdScalar& rhs);
|
||||
|
||||
inline LLSimdScalar& operator/=(const LLSimdScalar& rhs);
|
||||
|
||||
inline operator LLQuad() const
|
||||
{
|
||||
return mQ;
|
||||
}
|
||||
|
||||
inline const LLQuad& getQuad() const
|
||||
{
|
||||
return mQ;
|
||||
}
|
||||
|
||||
private:
|
||||
LLQuad mQ;
|
||||
};
|
||||
|
||||
#endif //LL_SIMD_TYPES_H
|
||||
157
indra/llmath/llsimdtypes.inl
Normal file
157
indra/llmath/llsimdtypes.inl
Normal file
@@ -0,0 +1,157 @@
|
||||
/**
|
||||
* @file llsimdtypes.inl
|
||||
* @brief Inlined definitions of basic SIMD math related types
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////
|
||||
// LLSimdScalar
|
||||
//////////////////
|
||||
|
||||
// Scalar sum a + b, implemented through LLSimdScalar::operator+=.
inline LLSimdScalar operator+(const LLSimdScalar& a, const LLSimdScalar& b)
{
    LLSimdScalar sum = a;
    sum += b;
    return sum;
}
|
||||
|
||||
// Scalar difference a - b, implemented through LLSimdScalar::operator-=.
inline LLSimdScalar operator-(const LLSimdScalar& a, const LLSimdScalar& b)
{
    LLSimdScalar difference = a;
    difference -= b;
    return difference;
}
|
||||
|
||||
// Scalar product a * b, implemented through LLSimdScalar::operator*=.
inline LLSimdScalar operator*(const LLSimdScalar& a, const LLSimdScalar& b)
{
    LLSimdScalar product = a;
    product *= b;
    return product;
}
|
||||
|
||||
// Scalar quotient a / b, implemented through LLSimdScalar::operator/=.
inline LLSimdScalar operator/(const LLSimdScalar& a, const LLSimdScalar& b)
{
    LLSimdScalar quotient = a;
    quotient /= b;
    return quotient;
}
|
||||
|
||||
// Negation: XOR with a mask that has only the sign bit set in each 32-bit element,
// so the sign of all four elements of the quad is flipped.
inline LLSimdScalar operator-(const LLSimdScalar& a)
{
    static LL_ALIGN_16(const U32 signMask[4]) = {0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    const LLQuad sign_bits = *reinterpret_cast<const LLQuad*>(signMask);
    return LLSimdScalar(_mm_xor_ps(sign_bits, a.getQuad()));
}
|
||||
|
||||
// Compare the lowest elements of a and b for equality (_mm_comieq_ss).
inline LLBool32 operator==(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_equal = _mm_comieq_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_equal);
}
|
||||
|
||||
// Compare the lowest elements of a and b for inequality (_mm_comineq_ss).
inline LLBool32 operator!=(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_not_equal = _mm_comineq_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_not_equal);
}
|
||||
|
||||
// True when the lowest element of a is less than that of b (_mm_comilt_ss).
inline LLBool32 operator<(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_less = _mm_comilt_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_less);
}
|
||||
|
||||
// True when the lowest element of a is less than or equal to that of b (_mm_comile_ss).
inline LLBool32 operator<=(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_less_or_equal = _mm_comile_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_less_or_equal);
}
|
||||
|
||||
// True when the lowest element of a is greater than that of b (_mm_comigt_ss).
inline LLBool32 operator>(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_greater = _mm_comigt_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_greater);
}
|
||||
|
||||
// True when the lowest element of a is greater than or equal to that of b (_mm_comige_ss).
inline LLBool32 operator>=(const LLSimdScalar& a, const LLSimdScalar& b)
{
    const int is_greater_or_equal = _mm_comige_ss(a.getQuad(), b.getQuad());
    return LLBool32(is_greater_or_equal);
}
|
||||
|
||||
// True when the absolute difference between this scalar and rhs does not exceed
// 'tolerance' (the comparison is inclusive).
inline LLBool32 LLSimdScalar::isApproximatelyEqual(const LLSimdScalar& rhs, F32 tolerance /* = F_APPROXIMATELY_ZERO */) const
{
    const LLSimdScalar limit( tolerance );
    const LLSimdScalar difference( _mm_sub_ss( mQ, rhs.mQ ) );
    return difference.getAbs() <= limit;
}
|
||||
|
||||
inline void LLSimdScalar::setMax( const LLSimdScalar& a, const LLSimdScalar& b )
|
||||
{
|
||||
mQ = _mm_max_ss( a, b );
|
||||
}
|
||||
|
||||
inline void LLSimdScalar::setMin( const LLSimdScalar& a, const LLSimdScalar& b )
|
||||
{
|
||||
mQ = _mm_min_ss( a, b );
|
||||
}
|
||||
|
||||
// Assign from a plain float: the value goes into the lowest element and the
// remaining elements are zeroed (_mm_set_ss).
inline LLSimdScalar& LLSimdScalar::operator=(F32 rhs)
{
    const LLQuad loaded = _mm_set_ss(rhs);
    mQ = loaded;
    return *this;
}
|
||||
|
||||
// Add rhs to this scalar in place (_mm_add_ss: only the lowest element is summed).
inline LLSimdScalar& LLSimdScalar::operator+=(const LLSimdScalar& rhs)
{
    const LLQuad sum = _mm_add_ss( mQ, rhs.mQ );
    mQ = sum;
    return *this;
}
|
||||
|
||||
// Subtract rhs from this scalar in place (_mm_sub_ss: only the lowest element is affected).
inline LLSimdScalar& LLSimdScalar::operator-=(const LLSimdScalar& rhs)
{
    const LLQuad difference = _mm_sub_ss( mQ, rhs.mQ );
    mQ = difference;
    return *this;
}
|
||||
|
||||
// Multiply this scalar by rhs in place (_mm_mul_ss: only the lowest element is affected).
inline LLSimdScalar& LLSimdScalar::operator*=(const LLSimdScalar& rhs)
{
    const LLQuad product = _mm_mul_ss( mQ, rhs.mQ );
    mQ = product;
    return *this;
}
|
||||
|
||||
// Divide this scalar by rhs in place (_mm_div_ss: only the lowest element is affected).
inline LLSimdScalar& LLSimdScalar::operator/=(const LLSimdScalar& rhs)
{
    const LLQuad quotient = _mm_div_ss( mQ, rhs.mQ );
    mQ = quotient;
    return *this;
}
|
||||
|
||||
// Absolute value: AND with a mask that clears the sign bit of each 32-bit element,
// so all four elements of the quad have their sign removed.
inline LLSimdScalar LLSimdScalar::getAbs() const
{
    static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
    const LLQuad magnitude_bits = *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A);
    return LLSimdScalar(_mm_and_ps( mQ, magnitude_bits ));
}
|
||||
|
||||
// Return the lowest element of the quad as a plain float.
inline F32 LLSimdScalar::getF32() const
{
    F32 lowest_element;
    _mm_store_ss(&lowest_element, mQ);
    return lowest_element;
}
|
||||
222
indra/llmath/llvector4a.cpp
Normal file
222
indra/llmath/llvector4a.cpp
Normal file
@@ -0,0 +1,222 @@
|
||||
/**
|
||||
* @file llvector4a.cpp
|
||||
* @brief SIMD vector implementation
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#include "llmath.h"
|
||||
#include "llquantize.h"
|
||||
|
||||
extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 };
|
||||
extern const LLQuad F_APPROXIMATELY_ZERO_4A = {
|
||||
F_APPROXIMATELY_ZERO,
|
||||
F_APPROXIMATELY_ZERO,
|
||||
F_APPROXIMATELY_ZERO,
|
||||
F_APPROXIMATELY_ZERO
|
||||
};
|
||||
|
||||
extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A );
|
||||
extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A );
|
||||
|
||||
// Copy 'bytes' bytes of floats from src to dst using aligned 16-byte SSE copies.
//
// dst, src : non-overlapping buffers; both must be 16-byte aligned because
//            copy4a() uses aligned loads and stores.
// bytes    : number of bytes to copy; must be non-zero and a multiple of sizeof(F32).
//
// Buffers larger than 64 bytes are copied in 64-byte chunks with prefetching
// once dst has been advanced to a 64-byte boundary. Any trailing floats that do
// not fill a whole 16-byte block are copied one at a time so the copy never
// reads or writes past the end of either buffer.
/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
{
    assert(src != NULL);
    assert(dst != NULL);
    assert(bytes > 0);
    assert((bytes % sizeof(F32))== 0);
    // copy4a() faults on unaligned addresses; catch that here with a clearer diagnostic.
    assert((reinterpret_cast<uintptr_t>(src) & 0xF) == 0);
    assert((reinterpret_cast<uintptr_t>(dst) & 0xF) == 0);

    F32* end = dst + (bytes / sizeof(F32) );

    if (bytes > 64)
    {
        F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);

        // The 64-byte loop stops once 64 bytes (16 floats) or fewer remain;
        // the rest is handled by the 16-byte tail loop below.
        F32* end_64 = end-16;

        _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
        _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
        _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
        _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);

        // 16 bytes at a time until dst reaches the 64-byte boundary.
        // At most 48 bytes are copied here, which fits because bytes > 64.
        while (dst < begin_64)
        {
            copy4a(dst, src);
            dst += 4;
            src += 4;
        }

        // 64 bytes per iteration, prefetching 512 bytes ahead in both buffers.
        while (dst < end_64)
        {
            _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
            _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
            copy4a(dst, src);
            copy4a(dst+4, src+4);
            copy4a(dst+8, src+8);
            copy4a(dst+12, src+12);

            dst += 16;
            src += 16;
        }
    }

    // Remaining whole 16-byte blocks.
    while (end - dst >= 4)
    {
        copy4a(dst, src);
        dst += 4;
        src += 4;
    }

    // Remaining 1-3 floats when 'bytes' is not a multiple of 16. A full 16-byte
    // copy here would overrun both buffers.
    while (dst < end)
    {
        *dst++ = *src++;
    }
}
|
||||
|
||||
void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )
|
||||
{
|
||||
const LLVector4a col0 = rot.getColumn(0);
|
||||
const LLVector4a col1 = rot.getColumn(1);
|
||||
const LLVector4a col2 = rot.getColumn(2);
|
||||
|
||||
LLVector4a result = _mm_load_ss( vec.getF32ptr() );
|
||||
result.splat<0>( result );
|
||||
result.mul( col0 );
|
||||
|
||||
{
|
||||
LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 );
|
||||
yyyy.splat<0>( yyyy );
|
||||
yyyy.mul( col1 );
|
||||
result.add( yyyy );
|
||||
}
|
||||
|
||||
{
|
||||
LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 );
|
||||
zzzz.splat<0>( zzzz );
|
||||
zzzz.mul( col2 );
|
||||
result.add( zzzz );
|
||||
}
|
||||
|
||||
*this = result;
|
||||
}
|
||||
|
||||
void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec )
|
||||
{
|
||||
const LLVector4a& quatVec = quat.getVector4a();
|
||||
LLVector4a temp; temp.setCross3(quatVec, vec);
|
||||
temp.add( temp );
|
||||
|
||||
const LLVector4a realPart( quatVec.getScalarAt<3>() );
|
||||
LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart );
|
||||
|
||||
mQ = vec;
|
||||
add( tempTimesReal );
|
||||
|
||||
LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp );
|
||||
add(imagCrossTemp);
|
||||
}
|
||||
|
||||
void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high )
|
||||
{
|
||||
LLVector4a val(mQ);
|
||||
LLVector4a delta; delta.setSub( high, low );
|
||||
|
||||
{
|
||||
val.clamp(low, high);
|
||||
val.sub(low);
|
||||
|
||||
// 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
|
||||
const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
|
||||
// {
|
||||
// static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
|
||||
// LLVector4a two; two.load4a( F_TWO_4A );
|
||||
//
|
||||
// // Here we use _mm_rcp_ps plus one round of newton-raphson
|
||||
// // We wish to find 'x' such that x = 1/delta
|
||||
// // As a first approximation, we take x0 = _mm_rcp_ps(delta)
|
||||
// // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
|
||||
// // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
|
||||
// const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
|
||||
// oneOverDelta.setMul( delta, recipApprox );
|
||||
// oneOverDelta.setSub( two, oneOverDelta );
|
||||
// oneOverDelta.mul( recipApprox );
|
||||
// }
|
||||
|
||||
val.mul(oneOverDelta);
|
||||
val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
|
||||
}
|
||||
|
||||
val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
|
||||
|
||||
{
|
||||
val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
|
||||
val.mul(delta);
|
||||
val.add(low);
|
||||
}
|
||||
|
||||
{
|
||||
LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
|
||||
LLVector4a absVal; absVal.setAbs( val );
|
||||
setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
|
||||
}
|
||||
}
|
||||
|
||||
void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
|
||||
{
|
||||
LLVector4a val(mQ);
|
||||
LLVector4a delta; delta.setSub( high, low );
|
||||
|
||||
{
|
||||
val.clamp(low, high);
|
||||
val.sub(low);
|
||||
|
||||
// 16-bit quantization means we need a round of Newton-Raphson
|
||||
LLVector4a oneOverDelta;
|
||||
{
|
||||
static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
|
||||
LLVector4a two; two.load4a( F_TWO_4A );
|
||||
|
||||
// Here we use _mm_rcp_ps plus one round of newton-raphson
|
||||
// We wish to find 'x' such that x = 1/delta
|
||||
// As a first approximation, we take x0 = _mm_rcp_ps(delta)
|
||||
// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
|
||||
// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
|
||||
const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
|
||||
oneOverDelta.setMul( delta, recipApprox );
|
||||
oneOverDelta.setSub( two, oneOverDelta );
|
||||
oneOverDelta.mul( recipApprox );
|
||||
}
|
||||
|
||||
val.mul(oneOverDelta);
|
||||
val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
|
||||
}
|
||||
|
||||
val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
|
||||
|
||||
{
|
||||
val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
|
||||
val.mul(delta);
|
||||
val.add(low);
|
||||
}
|
||||
|
||||
{
|
||||
LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
|
||||
LLVector4a absVal; absVal.setAbs( val );
|
||||
setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
|
||||
}
|
||||
}
|
||||
324
indra/llmath/llvector4a.h
Normal file
324
indra/llmath/llvector4a.h
Normal file
@@ -0,0 +1,324 @@
|
||||
/**
|
||||
* @file llvector4a.h
|
||||
* @brief LLVector4a class header file - memory aligned and vectorized 4 component vector
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_LLVECTOR4A_H
|
||||
#define LL_LLVECTOR4A_H
|
||||
|
||||
|
||||
class LLRotation;
|
||||
|
||||
#include <assert.h>
|
||||
#include "llpreprocessor.h"
|
||||
|
||||
///////////////////////////////////
|
||||
// FIRST TIME USERS PLEASE READ
|
||||
//////////////////////////////////
|
||||
// This is just the beginning of LLVector4a. There are many more useful functions
|
||||
// yet to be implemented. For example, setNeg to negate a vector, rotate() to apply
|
||||
// a matrix rotation, various functions to manipulate only the X, Y, and Z elements
|
||||
// and many others (including a whole variety of accessors). So if you don't see a
|
||||
// function here that you need, please contact Falcon or someone else with SSE
|
||||
// experience (Richard, I think, has some and davep has a little as of the time
|
||||
// of this writing, July 08, 2010) about getting it implemented before you resort to
|
||||
// LLVector3/LLVector4.
|
||||
/////////////////////////////////
|
||||
|
||||
class LLVector4a
|
||||
{
|
||||
public:
|
||||
|
||||
///////////////////////////////////
|
||||
// STATIC METHODS
|
||||
///////////////////////////////////
|
||||
|
||||
// Call initClass() at startup to avoid 15,000+ cycle penalties from denormalized numbers
|
||||
static void initClass()
|
||||
{
|
||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
||||
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
|
||||
}
|
||||
|
||||
// Return a vector of all zeros
|
||||
static inline const LLVector4a& getZero()
|
||||
{
|
||||
extern const LLVector4a LL_V4A_ZERO;
|
||||
return LL_V4A_ZERO;
|
||||
}
|
||||
|
||||
// Return a vector of all epsilon, where epsilon is a small float suitable for approximate equality checks
|
||||
static inline const LLVector4a& getEpsilon()
|
||||
{
|
||||
extern const LLVector4a LL_V4A_EPSILON;
|
||||
return LL_V4A_EPSILON;
|
||||
}
|
||||
|
||||
// Copy 16 bytes from src to dst. Source and destination must be 16-byte aligned
|
||||
static inline void copy4a(F32* dst, const F32* src)
|
||||
{
|
||||
_mm_store_ps(dst, _mm_load_ps(src));
|
||||
}
|
||||
|
||||
// Copy words 16-byte blocks from src to dst. Source and destination must not overlap.
|
||||
static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);
|
||||
|
||||
////////////////////////////////////
|
||||
// CONSTRUCTORS
|
||||
////////////////////////////////////
|
||||
|
||||
LLVector4a()
|
||||
{ //DO NOT INITIALIZE -- The overhead is completely unnecessary
|
||||
}
|
||||
|
||||
LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
|
||||
{
|
||||
set(x,y,z,w);
|
||||
}
|
||||
|
||||
LLVector4a(F32 x)
|
||||
{
|
||||
splat(x);
|
||||
}
|
||||
|
||||
LLVector4a(const LLSimdScalar& x)
|
||||
{
|
||||
splat(x);
|
||||
}
|
||||
|
||||
LLVector4a(LLQuad q)
|
||||
{
|
||||
mQ = q;
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// LOAD/STORE
|
||||
////////////////////////////////////
|
||||
|
||||
// Load from 16-byte aligned src array (preferred method of loading)
|
||||
inline void load4a(const F32* src);
|
||||
|
||||
// Load from unaligned src array (NB: Significantly slower than load4a)
|
||||
inline void loadua(const F32* src);
|
||||
|
||||
// Load only three floats beginning at address 'src'. Slowest method.
|
||||
inline void load3(const F32* src);
|
||||
|
||||
// Store to a 16-byte aligned memory address
|
||||
inline void store4a(F32* dst) const;
|
||||
|
||||
////////////////////////////////////
|
||||
// BASIC GET/SET
|
||||
////////////////////////////////////
|
||||
|
||||
// Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
|
||||
inline F32* getF32ptr();
|
||||
|
||||
// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
|
||||
inline const F32* const getF32ptr() const;
|
||||
|
||||
// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
|
||||
// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
|
||||
inline F32 operator[](const S32 idx) const;
|
||||
|
||||
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
|
||||
inline LLSimdScalar getScalarAt(const S32 idx) const;
|
||||
|
||||
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
|
||||
template <int N> LL_FORCE_INLINE LLSimdScalar getScalarAt() const;
|
||||
|
||||
// Set to an x, y, z and optional w provided
|
||||
inline void set(F32 x, F32 y, F32 z, F32 w = 0.f);
|
||||
|
||||
// Set to all zeros. This is preferred to using ::getZero()
|
||||
inline void clear();
|
||||
|
||||
// Set all elements to 'x'
|
||||
inline void splat(const F32 x);
|
||||
|
||||
// Set all elements to 'x'
|
||||
inline void splat(const LLSimdScalar& x);
|
||||
|
||||
// Set all 4 elements to element N of src, with N known at compile time
|
||||
template <int N> void splat(const LLVector4a& src);
|
||||
|
||||
// Set all 4 elements to element i of v, with i NOT known at compile time
|
||||
inline void splat(const LLVector4a& v, U32 i);
|
||||
|
||||
// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
|
||||
inline void setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse );
|
||||
|
||||
////////////////////////////////////
|
||||
// ALGEBRAIC
|
||||
////////////////////////////////////
|
||||
|
||||
// Set this to the element-wise (a + b)
|
||||
inline void setAdd(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set this to element-wise (a - b)
|
||||
inline void setSub(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set this to element-wise multiply (a * b)
|
||||
inline void setMul(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set this to element-wise quotient (a / b)
|
||||
inline void setDiv(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set this to the element-wise absolute value of src
|
||||
inline void setAbs(const LLVector4a& src);
|
||||
|
||||
// Add to each component in this vector the corresponding component in rhs
|
||||
inline void add(const LLVector4a& rhs);
|
||||
|
||||
// Subtract from each component in this vector the corresponding component in rhs
|
||||
inline void sub(const LLVector4a& rhs);
|
||||
|
||||
// Multiply each component in this vector by the corresponding component in rhs
|
||||
inline void mul(const LLVector4a& rhs);
|
||||
|
||||
// Divide each component in this vector by the corresponding component in rhs
|
||||
inline void div(const LLVector4a& rhs);
|
||||
|
||||
// Multiply this vector by x in a scalar fashion
|
||||
inline void mul(const F32 x);
|
||||
|
||||
// Set this to (a x b) (geometric cross-product)
|
||||
inline void setCross3(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set all elements to the dot product of the x, y, and z elements in a and b
|
||||
inline void setAllDot3(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Set all elements to the dot product of the x, y, z, and w elements in a and b
|
||||
inline void setAllDot4(const LLVector4a& a, const LLVector4a& b);
|
||||
|
||||
// Return the 3D dot product of this vector and b
|
||||
inline LLSimdScalar dot3(const LLVector4a& b) const;
|
||||
|
||||
// Return the 4D dot product of this vector and b
|
||||
inline LLSimdScalar dot4(const LLVector4a& b) const;
|
||||
|
||||
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline void normalize3();
|
||||
|
||||
// Same as normalize3() but with respect to all 4 components
|
||||
inline void normalize4();
|
||||
|
||||
// Same as normalize3(), but returns length as a SIMD scalar
|
||||
inline LLSimdScalar normalize3withLength();
|
||||
|
||||
// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline void normalize3fast();
|
||||
|
||||
// Return true if this vector is normalized with respect to x,y,z up to tolerance
|
||||
inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const;
|
||||
|
||||
// Return true if this vector is normalized with respect to all components up to tolerance
|
||||
inline LLBool32 isNormalized4( F32 tolerance = 1e-3 ) const;
|
||||
|
||||
// Set all elements to the length of vector 'v'
|
||||
inline void setAllLength3( const LLVector4a& v );
|
||||
|
||||
// Get this vector's length
|
||||
inline LLSimdScalar getLength3() const;
|
||||
|
||||
// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
|
||||
inline void setMin(const LLVector4a& lhs, const LLVector4a& rhs);
|
||||
|
||||
// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
|
||||
inline void setMax(const LLVector4a& lhs, const LLVector4a& rhs);
|
||||
|
||||
// Clamps this vector to be within the component-wise range low to high (inclusive)
|
||||
inline void clamp( const LLVector4a& low, const LLVector4a& high );
|
||||
|
||||
// Set this to (c * lhs) + rhs * ( 1 - c)
|
||||
inline void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c);
|
||||
|
||||
// Return true (nonzero) if x, y, z (and w for Finite4) are all finite floats
|
||||
inline LLBool32 isFinite3() const;
|
||||
inline LLBool32 isFinite4() const;
|
||||
|
||||
// Set this vector to 'vec' rotated by the LLRotation or LLQuaternion2 provided
|
||||
void setRotated( const LLRotation& rot, const LLVector4a& vec );
|
||||
void setRotated( const class LLQuaternion2& quat, const LLVector4a& vec );
|
||||
|
||||
// Set this vector to 'vec' rotated by the INVERSE of the LLRotation or LLQuaternion2 provided
|
||||
inline void setRotatedInv( const LLRotation& rot, const LLVector4a& vec );
|
||||
inline void setRotatedInv( const class LLQuaternion2& quat, const LLVector4a& vec );
|
||||
|
||||
// Quantize this vector to 8 or 16 bit precision
|
||||
void quantize8( const LLVector4a& low, const LLVector4a& high );
|
||||
void quantize16( const LLVector4a& low, const LLVector4a& high );
|
||||
|
||||
////////////////////////////////////
|
||||
// LOGICAL
|
||||
////////////////////////////////////
|
||||
// The functions in this section will compare the elements in this vector
|
||||
// to those in rhs and return an LLVector4Logical with all bits set in elements
|
||||
// where the comparison was true and all bits unset in elements where the comparison
|
||||
// was false. See llvector4logica.h
|
||||
////////////////////////////////////
|
||||
// WARNING: Other than equals3 and equals4, these functions do NOT account
|
||||
// for floating point tolerance. You should include the appropriate tolerance
|
||||
// in the inputs.
|
||||
////////////////////////////////////
|
||||
|
||||
inline LLVector4Logical greaterThan(const LLVector4a& rhs) const;
|
||||
|
||||
inline LLVector4Logical lessThan(const LLVector4a& rhs) const;
|
||||
|
||||
inline LLVector4Logical greaterEqual(const LLVector4a& rhs) const;
|
||||
|
||||
inline LLVector4Logical lessEqual(const LLVector4a& rhs) const;
|
||||
|
||||
inline LLVector4Logical equal(const LLVector4a& rhs) const;
|
||||
|
||||
// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
|
||||
inline bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
|
||||
|
||||
inline bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;
|
||||
|
||||
////////////////////////////////////
|
||||
// OPERATORS
|
||||
////////////////////////////////////
|
||||
|
||||
// Do NOT add aditional operators without consulting someone with SSE experience
|
||||
inline const LLVector4a& operator= ( const LLVector4a& rhs );
|
||||
|
||||
inline const LLVector4a& operator= ( const LLQuad& rhs );
|
||||
|
||||
inline operator LLQuad() const;
|
||||
|
||||
private:
|
||||
LLQuad mQ;
|
||||
};
|
||||
|
||||
// Grow the component-wise bounds [min, max] so that they include point p:
// min becomes the element-wise minimum of (min, p), max the element-wise maximum of (max, p).
inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
{
    LLVector4a& lower_bound = min;
    LLVector4a& upper_bound = max;

    lower_bound.setMin(lower_bound, p);
    upper_bound.setMax(upper_bound, p);
}
|
||||
|
||||
#endif
|
||||
593
indra/llmath/llvector4a.inl
Normal file
593
indra/llmath/llvector4a.inl
Normal file
@@ -0,0 +1,593 @@
|
||||
/**
|
||||
* @file llvector4a.inl
|
||||
* @brief LLVector4a inline function implementations
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
////////////////////////////////////
|
||||
// LOAD/STORE
|
||||
////////////////////////////////////
|
||||
|
||||
// Load from 16-byte aligned src array (preferred method of loading)
|
||||
inline void LLVector4a::load4a(const F32* src)
|
||||
{
|
||||
mQ = _mm_load_ps(src);
|
||||
}
|
||||
|
||||
// Load from unaligned src array (NB: Significantly slower than load4a)
|
||||
inline void LLVector4a::loadua(const F32* src)
|
||||
{
|
||||
mQ = _mm_loadu_ps(src);
|
||||
}
|
||||
|
||||
// Load only three floats beginning at address 'src'. Slowest method.
|
||||
inline void LLVector4a::load3(const F32* src)
|
||||
{
|
||||
// mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
|
||||
// NB: This differs from the convention of { Z, Y, X, W }
|
||||
mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
|
||||
}
|
||||
|
||||
// Store to a 16-byte aligned memory address
|
||||
inline void LLVector4a::store4a(F32* dst) const
|
||||
{
|
||||
_mm_store_ps(dst, mQ);
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// BASIC GET/SET
|
||||
////////////////////////////////////
|
||||
|
||||
// Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
|
||||
F32* LLVector4a::getF32ptr()
|
||||
{
|
||||
return (F32*) &mQ;
|
||||
}
|
||||
|
||||
// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
|
||||
const F32* const LLVector4a::getF32ptr() const
|
||||
{
|
||||
return (const F32* const) &mQ;
|
||||
}
|
||||
|
||||
// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
|
||||
// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
|
||||
inline F32 LLVector4a::operator[](const S32 idx) const
|
||||
{
|
||||
return ((F32*)&mQ)[idx];
|
||||
}
|
||||
|
||||
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
|
||||
inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
|
||||
{
|
||||
// Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
|
||||
switch (idx)
|
||||
{
|
||||
case 0:
|
||||
return mQ;
|
||||
case 1:
|
||||
return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
case 2:
|
||||
return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
case 3:
|
||||
default:
|
||||
return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
}
|
||||
|
||||
// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
|
||||
template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
|
||||
{
|
||||
return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
|
||||
}
|
||||
|
||||
// Specialization for element 0: it already occupies the low lane, so the
// quad is returned as-is with no shuffle.
template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
{
    const LLQuad& asIs = mQ;
    return asIs;
}
|
||||
|
||||
// Set to an x, y, z and optional w provided
|
||||
inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
|
||||
{
|
||||
mQ = _mm_set_ps(w, z, y, x);
|
||||
}
|
||||
|
||||
// Set to all zeros
|
||||
inline void LLVector4a::clear()
|
||||
{
|
||||
mQ = LLVector4a::getZero().mQ;
|
||||
}
|
||||
|
||||
inline void LLVector4a::splat(const F32 x)
|
||||
{
|
||||
mQ = _mm_set1_ps(x);
|
||||
}
|
||||
|
||||
inline void LLVector4a::splat(const LLSimdScalar& x)
|
||||
{
|
||||
mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
|
||||
}
|
||||
|
||||
// Set all 4 elements to element N of src, with N known at compile time
|
||||
template <int N> void LLVector4a::splat(const LLVector4a& src)
|
||||
{
|
||||
mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
|
||||
}
|
||||
|
||||
// Set all 4 elements to element i of v, with i NOT known at compile time
|
||||
inline void LLVector4a::splat(const LLVector4a& v, U32 i)
|
||||
{
|
||||
switch (i)
|
||||
{
|
||||
case 0:
|
||||
mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
break;
|
||||
case 1:
|
||||
mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
break;
|
||||
case 2:
|
||||
mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
break;
|
||||
case 3:
|
||||
mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
|
||||
inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
|
||||
{
|
||||
// ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
|
||||
// E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
|
||||
// (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,
|
||||
// as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
|
||||
// Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
|
||||
mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// ALGEBRAIC
|
||||
////////////////////////////////////
|
||||
|
||||
// Set this to the element-wise (a + b)
|
||||
inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
mQ = _mm_add_ps(a.mQ, b.mQ);
|
||||
}
|
||||
|
||||
// Set this to element-wise (a - b)
|
||||
inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
mQ = _mm_sub_ps(a.mQ, b.mQ);
|
||||
}
|
||||
|
||||
// Set this to element-wise multiply (a * b)
|
||||
inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
mQ = _mm_mul_ps(a.mQ, b.mQ);
|
||||
}
|
||||
|
||||
// Set this to element-wise quotient (a / b)
|
||||
inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
mQ = _mm_div_ps( a.mQ, b.mQ );
|
||||
}
|
||||
|
||||
// Set this to the element-wise absolute value of src
|
||||
inline void LLVector4a::setAbs(const LLVector4a& src)
|
||||
{
|
||||
static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
|
||||
mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
|
||||
}
|
||||
|
||||
// Add to each component in this vector the corresponding component in rhs
|
||||
inline void LLVector4a::add(const LLVector4a& rhs)
|
||||
{
|
||||
mQ = _mm_add_ps(mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Subtract from each component in this vector the corresponding component in rhs
|
||||
inline void LLVector4a::sub(const LLVector4a& rhs)
|
||||
{
|
||||
mQ = _mm_sub_ps(mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Multiply each component in this vector by the corresponding component in rhs
|
||||
inline void LLVector4a::mul(const LLVector4a& rhs)
|
||||
{
|
||||
mQ = _mm_mul_ps(mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Divide each component in this vector by the corresponding component in rhs
|
||||
inline void LLVector4a::div(const LLVector4a& rhs)
|
||||
{
|
||||
// TODO: Check accuracy, maybe add divFast
|
||||
mQ = _mm_div_ps(mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Multiply this vector by x in a scalar fashion
|
||||
inline void LLVector4a::mul(const F32 x)
|
||||
{
|
||||
LLVector4a t;
|
||||
t.splat(x);
|
||||
|
||||
mQ = _mm_mul_ps(mQ, t.mQ);
|
||||
}
|
||||
|
||||
// Set this to (a x b) (geometric cross-product)
|
||||
inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
// Vectors are stored in memory in w, z, y, x order from high to low
|
||||
// Set vector1 = { a[W], a[X], a[Z], a[Y] }
|
||||
const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
|
||||
// Set vector2 = { b[W], b[Y], b[X], b[Z] }
|
||||
const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
|
||||
// mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
|
||||
mQ = _mm_mul_ps( vector1, vector2 );
|
||||
// vector3 = { a[W], a[Y], a[X], a[Z] }
|
||||
const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
|
||||
// vector4 = { b[W], b[X], b[Z], b[Y] }
|
||||
const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
|
||||
// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
|
||||
mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
|
||||
}
|
||||
|
||||
/* This function works, but may be slightly slower than the one below on older machines
|
||||
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
|
||||
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
|
||||
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
|
||||
const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
|
||||
// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
|
||||
// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
|
||||
// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
|
||||
const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
|
||||
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
|
||||
mQ = _mm_add_ps(zSplat, xPlusYSplat);
|
||||
}*/
|
||||
|
||||
// Set all elements to the dot product of the x, y, and z elements in a and b
|
||||
inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
|
||||
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
|
||||
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
|
||||
const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
|
||||
// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
|
||||
// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
|
||||
// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
|
||||
const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
|
||||
// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
|
||||
mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
|
||||
}
|
||||
|
||||
// Set all elements to the dot product of the x, y, z, and w elements in a and b
|
||||
inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
|
||||
{
|
||||
// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
|
||||
const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
|
||||
// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
|
||||
const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
|
||||
// zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
|
||||
// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
|
||||
const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
|
||||
const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
|
||||
|
||||
// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
|
||||
mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
|
||||
}
|
||||
|
||||
// Return the 3D dot product of this vector and b
|
||||
inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
|
||||
{
|
||||
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
|
||||
const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
|
||||
const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
|
||||
const LLQuad xPlusY = _mm_add_ps( ab, splatY );
|
||||
return _mm_add_ps( xPlusY, splatZ );
|
||||
}
|
||||
|
||||
// Return the 4D dot product of this vector and b
|
||||
inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
|
||||
{
|
||||
// ab = { w, z, y, x }
|
||||
const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
|
||||
// upperProdsInLowerElems = { y, x, y, x }
|
||||
const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
|
||||
// sumOfPairs = { w+y, z+x, 2y, 2x }
|
||||
const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
|
||||
// shuffled = { z+x, z+x, z+x, z+x }
|
||||
const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
|
||||
return _mm_add_ss( sumOfPairs, shuffled );
|
||||
}
|
||||
|
||||
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline void LLVector4a::normalize3()
|
||||
{
|
||||
// lenSqrd = a dot a
|
||||
LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
|
||||
// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
|
||||
const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
|
||||
static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
|
||||
// Now we do one round of Newton-Raphson approximation to get full accuracy
|
||||
// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
|
||||
// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
|
||||
// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
|
||||
// = 0.5 * w * (3 - a*w^2)
|
||||
// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
|
||||
// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
|
||||
const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
|
||||
const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
|
||||
const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
|
||||
const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
|
||||
mQ = _mm_mul_ps( mQ, nrApprox );
|
||||
}
|
||||
|
||||
// Normalize this vector with respect to all components. Accurate to 22 bites of precision.
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline void LLVector4a::normalize4()
|
||||
{
|
||||
// lenSqrd = a dot a
|
||||
LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
|
||||
// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
|
||||
const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
|
||||
static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
|
||||
// Now we do one round of Newton-Raphson approximation to get full accuracy
|
||||
// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
|
||||
// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
|
||||
// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
|
||||
// = 0.5 * w * (3 - a*w^2)
|
||||
// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
|
||||
// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
|
||||
const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
|
||||
const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
|
||||
const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
|
||||
const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
|
||||
mQ = _mm_mul_ps( mQ, nrApprox );
|
||||
}
|
||||
|
||||
// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline LLSimdScalar LLVector4a::normalize3withLength()
|
||||
{
|
||||
// lenSqrd = a dot a
|
||||
LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
|
||||
// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
|
||||
const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
|
||||
static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
|
||||
// Now we do one round of Newton-Raphson approximation to get full accuracy
|
||||
// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
|
||||
// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
|
||||
// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
|
||||
// = 0.5 * w * (3 - a*w^2)
|
||||
// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
|
||||
// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
|
||||
const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
|
||||
const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
|
||||
const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
|
||||
const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
|
||||
mQ = _mm_mul_ps( mQ, nrApprox );
|
||||
return _mm_sqrt_ss(lenSqrd);
|
||||
}
|
||||
|
||||
// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
|
||||
// Note that this does not consider zero length vectors!
|
||||
inline void LLVector4a::normalize3fast()
|
||||
{
|
||||
LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
|
||||
const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
|
||||
mQ = _mm_mul_ps( mQ, approxRsqrt );
|
||||
}
|
||||
|
||||
// Return true if this vector is normalized with respect to x,y,z up to tolerance
|
||||
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
|
||||
{
|
||||
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
|
||||
LLSimdScalar tol = _mm_load_ss( &tolerance );
|
||||
tol = _mm_mul_ss( tol, tol );
|
||||
LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
|
||||
lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
|
||||
lenSquared.setAbs(lenSquared);
|
||||
return _mm_comile_ss( lenSquared, tol );
|
||||
}
|
||||
|
||||
// Return true if this vector is normalized with respect to all components up to tolerance
|
||||
inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
|
||||
{
|
||||
static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
|
||||
LLSimdScalar tol = _mm_load_ss( &tolerance );
|
||||
tol = _mm_mul_ss( tol, tol );
|
||||
LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
|
||||
lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
|
||||
lenSquared.setAbs(lenSquared);
|
||||
return _mm_comile_ss( lenSquared, tol );
|
||||
}
|
||||
|
||||
// Set all elements to the length of vector 'v'
|
||||
inline void LLVector4a::setAllLength3( const LLVector4a& v )
|
||||
{
|
||||
LLVector4a lenSqrd;
|
||||
lenSqrd.setAllDot3(v, v);
|
||||
|
||||
mQ = _mm_sqrt_ps(lenSqrd.mQ);
|
||||
}
|
||||
|
||||
// Get this vector's length
|
||||
inline LLSimdScalar LLVector4a::getLength3() const
|
||||
{
|
||||
return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
|
||||
}
|
||||
|
||||
// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
|
||||
inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
|
||||
{
|
||||
mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
|
||||
inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
|
||||
{
|
||||
mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
|
||||
}
|
||||
|
||||
// Set this to (c * lhs) + rhs * ( 1 - c)
|
||||
inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
|
||||
{
|
||||
LLVector4a a = lhs;
|
||||
a.mul(c);
|
||||
|
||||
LLVector4a b = rhs;
|
||||
b.mul(1.f-c);
|
||||
|
||||
setAdd(a, b);
|
||||
}
|
||||
|
||||
// Return true if none of the x, y, z components is NaN or infinite.
inline LLBool32 LLVector4a::isFinite3() const
{
    // IEEE-754 single-precision exponent bits; all of them set means NaN or Inf.
    static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
    const __m128i exponentMask = *reinterpret_cast<const __m128i*> (nanOrInfMask);

    // Isolate each element's exponent and flag the ones that are all ones.
    const __m128i exponents = _mm_and_si128( _mm_castps_si128(mQ), exponentMask );
    const LLVector4Logical nonFinite = _mm_castsi128_ps(_mm_cmpeq_epi32( exponents, exponentMask ));

    return !nonFinite.areAnySet( LLVector4Logical::MASK_XYZ );
}
|
||||
|
||||
// Return true if none of the four components is NaN or infinite.
inline LLBool32 LLVector4a::isFinite4() const
{
    // IEEE-754 single-precision exponent bits; all of them set means NaN or Inf.
    static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
    const __m128i exponentMask = *reinterpret_cast<const __m128i*> (nanOrInfMask);

    // Isolate each element's exponent and flag the ones that are all ones.
    const __m128i exponents = _mm_and_si128( _mm_castps_si128(mQ), exponentMask );
    const LLVector4Logical nonFinite = _mm_castsi128_ps(_mm_cmpeq_epi32( exponents, exponentMask ));

    return !nonFinite.areAnySet( LLVector4Logical::MASK_XYZW );
}
|
||||
|
||||
inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
|
||||
{
|
||||
LLRotation inv; inv.setTranspose( rot );
|
||||
setRotated( inv, vec );
|
||||
}
|
||||
|
||||
inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
|
||||
{
|
||||
LLQuaternion2 invRot; invRot.setConjugate( quat );
|
||||
setRotated(invRot, vec);
|
||||
}
|
||||
|
||||
inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
|
||||
{
|
||||
const LLVector4Logical highMask = greaterThan( high );
|
||||
const LLVector4Logical lowMask = lessThan( low );
|
||||
|
||||
setSelectWithMask( highMask, high, *this );
|
||||
setSelectWithMask( lowMask, low, *this );
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////
|
||||
// LOGICAL
|
||||
////////////////////////////////////
|
||||
// The functions in this section will compare the elements in this vector
|
||||
// to those in rhs and return an LLVector4Logical with all bits set in elements
|
||||
// where the comparison was true and all bits unset in elements where the comparison
|
||||
// was false. See llvector4logical.h
|
||||
////////////////////////////////////
|
||||
// WARNING: Other than equals3 and equals4, these functions do NOT account
|
||||
// for floating point tolerance. You should include the appropriate tolerance
|
||||
// in the inputs.
|
||||
////////////////////////////////////
|
||||
|
||||
// Per-component (this > rhs): all bits set in elements where it holds.
inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
{
    const LLQuad mask = _mm_cmpgt_ps(mQ, rhs.mQ);
    return mask;
}
|
||||
|
||||
// Per-component (this < rhs): all bits set in elements where it holds.
inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
{
    const LLQuad mask = _mm_cmplt_ps(mQ, rhs.mQ);
    return mask;
}
|
||||
|
||||
// Per-component (this >= rhs): all bits set in elements where it holds.
inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
{
    const LLQuad mask = _mm_cmpge_ps(mQ, rhs.mQ);
    return mask;
}
|
||||
|
||||
// Per-component (this <= rhs): all bits set in elements where it holds.
inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
{
    const LLQuad mask = _mm_cmple_ps(mQ, rhs.mQ);
    return mask;
}
|
||||
|
||||
// Per-component exact (this == rhs): all bits set in elements where it holds.
// No tolerance is applied.
inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
{
    const LLQuad mask = _mm_cmpeq_ps(mQ, rhs.mQ);
    return mask;
}
|
||||
|
||||
// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
|
||||
inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
|
||||
{
|
||||
LLVector4a diff; diff.setSub( *this, rhs );
|
||||
diff.setAbs( diff );
|
||||
const LLQuad tol = _mm_set1_ps( tolerance );
|
||||
const LLQuad cmp = _mm_cmplt_ps( diff, tol );
|
||||
return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
|
||||
}
|
||||
|
||||
inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
|
||||
{
|
||||
LLVector4a diff; diff.setSub( *this, rhs );
|
||||
diff.setAbs( diff );
|
||||
const LLQuad tol = _mm_set1_ps( tolerance );
|
||||
const LLQuad t = _mm_cmplt_ps( diff, tol );
|
||||
return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
|
||||
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// OPERATORS
|
||||
////////////////////////////////////
|
||||
|
||||
// Do NOT add aditional operators without consulting someone with SSE experience
|
||||
inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
|
||||
{
|
||||
mQ = rhs.mQ;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Assign directly from a raw LLQuad; returns *this as a const reference.
inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
{
    const LLQuad copied = rhs;
    mQ = copied;
    return *this;
}
|
||||
|
||||
// Implicit conversion to the underlying LLQuad, so an LLVector4a can be
// passed straight to SSE intrinsics.
inline LLVector4a::operator LLQuad() const
{
    const LLQuad value = mQ;
    return value;
}
|
||||
124
indra/llmath/llvector4logical.h
Normal file
124
indra/llmath/llvector4logical.h
Normal file
@@ -0,0 +1,124 @@
|
||||
/**
|
||||
* @file llvector4logical.h
|
||||
* @brief LLVector4Logical class header file - Companion class to LLVector4a for logical and bit-twiddling operations
|
||||
*
|
||||
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_VECTOR4LOGICAL_H
|
||||
#define LL_VECTOR4LOGICAL_H
|
||||
|
||||
|
||||
////////////////////////////
|
||||
// LLVector4Logical
|
||||
////////////////////////////
|
||||
// This class is incomplete. If you need additional functionality,
|
||||
// for example setting/unsetting particular elements or performing
|
||||
// other boolean operations, feel free to implement. If you need
|
||||
// assistance in determining the most optimal implementation,
|
||||
// contact someone with SSE experience (Falcon, Richard, Davep, e.g.)
|
||||
////////////////////////////
|
||||
|
||||
static LL_ALIGN_16(const U32 S_V4LOGICAL_MASK_TABLE[4*4]) =
|
||||
{
|
||||
0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
class LLVector4Logical
|
||||
{
|
||||
public:
|
||||
|
||||
enum {
|
||||
MASK_X = 1,
|
||||
MASK_Y = 1 << 1,
|
||||
MASK_Z = 1 << 2,
|
||||
MASK_W = 1 << 3,
|
||||
MASK_XYZ = MASK_X | MASK_Y | MASK_Z,
|
||||
MASK_XYZW = MASK_XYZ | MASK_W
|
||||
};
|
||||
|
||||
// Empty default ctor
|
||||
LLVector4Logical() {}
|
||||
|
||||
LLVector4Logical( const LLQuad& quad )
|
||||
{
|
||||
mQ = quad;
|
||||
}
|
||||
|
||||
// Create and return a mask consisting of the lowest order bit of each element
|
||||
inline U32 getGatheredBits() const
|
||||
{
|
||||
return _mm_movemask_ps(mQ);
|
||||
};
|
||||
|
||||
// Invert this mask
|
||||
inline LLVector4Logical& invert()
|
||||
{
|
||||
static const LL_ALIGN_16(U32 allOnes[4]) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
|
||||
mQ = _mm_andnot_ps( mQ, *(LLQuad*)(allOnes) );
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline LLBool32 areAllSet( U32 mask ) const
|
||||
{
|
||||
return ( getGatheredBits() & mask) == mask;
|
||||
}
|
||||
|
||||
inline LLBool32 areAllSet() const
|
||||
{
|
||||
return areAllSet( MASK_XYZW );
|
||||
}
|
||||
|
||||
inline LLBool32 areAnySet( U32 mask ) const
|
||||
{
|
||||
return getGatheredBits() & mask;
|
||||
}
|
||||
|
||||
inline LLBool32 areAnySet() const
|
||||
{
|
||||
return areAnySet( MASK_XYZW );
|
||||
}
|
||||
|
||||
inline operator LLQuad() const
|
||||
{
|
||||
return mQ;
|
||||
}
|
||||
|
||||
inline void clear()
|
||||
{
|
||||
mQ = _mm_setzero_ps();
|
||||
}
|
||||
|
||||
template<int N> void setElement()
|
||||
{
|
||||
mQ = _mm_or_ps( mQ, *reinterpret_cast<const LLQuad*>(S_V4LOGICAL_MASK_TABLE + 4*N) );
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
LLQuad mQ;
|
||||
};
|
||||
|
||||
#endif //LL_VECTOR4LOGICAL_H
|
||||
257
indra/llmath/llvolumeoctree.cpp
Normal file
257
indra/llmath/llvolumeoctree.cpp
Normal file
@@ -0,0 +1,257 @@
|
||||
/**
|
||||
|
||||
* @file llvolumeoctree.cpp
|
||||
*
|
||||
* $LicenseInfo:firstyear=2002&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#include "llvolumeoctree.h"
|
||||
#include "llvector4a.h"
|
||||
|
||||
// Test whether the line segment [start, end] intersects the axis-aligned box
// given by its center and 'size', which is added directly to the extents and
// so acts as the per-axis half-extent. Only x, y and z take part.
// Returns true on intersection, false otherwise.
//
// Separating-axis test: first the three box axes, then the three axes formed
// by crossing the segment direction with each box axis.
BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size)
{
    // Half of the segment vector
    LLVector4a halfDir;
    halfDir.setSub(end, start);
    halfDir.mul(0.5f);

    // Segment midpoint relative to the box center
    LLVector4a midOffset;
    midOffset.setAdd(end, start);
    midOffset.mul(0.5f);
    midOffset.sub(center);

    LLVector4a absHalfDir;
    absHalfDir.setAbs(halfDir);

    // Box axes: separated if |midOffset| > size + |halfDir| on x, y or z
    LLVector4a reach;
    reach.setAdd(size, absHalfDir);

    LLVector4a absOffset;
    absOffset.setAbs(midOffset);

    const U32 outsideBoxAxes = absOffset.greaterThan(reach).getGatheredBits();
    if (outsideBoxAxes & 0x7)
    {
        return false;
    }

    // Cross-product axes: compare |halfDir x midOffset| against the projected box radius
    LLVector4a absCross;
    absCross.setCross3(halfDir, midOffset);
    absCross.setAbs(absCross);

    LLVector4a sizeSwizzle, dirSwizzle;

    // firstTerm (x, y, z) = { size.y*|d|.z, size.x*|d|.z, size.x*|d|.y }
    sizeSwizzle = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,0,0,1));
    dirSwizzle = _mm_shuffle_ps(absHalfDir, absHalfDir, _MM_SHUFFLE(3,1,2,2));
    LLVector4a firstTerm;
    firstTerm.setMul(sizeSwizzle, dirSwizzle);

    // radius (x, y, z) = { size.z*|d|.y, size.z*|d|.x, size.y*|d|.x } + firstTerm
    sizeSwizzle = _mm_shuffle_ps(size, size, _MM_SHUFFLE(3,1,2,2));
    dirSwizzle = _mm_shuffle_ps(absHalfDir, absHalfDir, _MM_SHUFFLE(3,0,0,1));
    LLVector4a radius;
    radius.setMul(sizeSwizzle, dirSwizzle);
    radius.add(firstTerm);

    const U32 outsideCrossAxes = absCross.greaterThan(radius).getGatheredBits();

    return (outsideCrossAxes & 0x7) ? false : true;
}
|
||||
|
||||
|
||||
#if 0 //MESH
|
||||
// Creates a listener and immediately registers it with the octree node it is
// constructed for.
LLVolumeOctreeListener::LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node)
{
	node->addListener(this);
}
|
||||
|
||||
// Nothing to release: the listener holds only its bounds and extents vectors.
LLVolumeOctreeListener::~LLVolumeOctreeListener()
{
}
|
||||
|
||||
void LLVolumeOctreeListener::handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent,
|
||||
LLOctreeNode<LLVolumeTriangle>* child)
|
||||
{
|
||||
new LLVolumeOctreeListener(child);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Sets up a ray/segment query against the triangles of one volume face.
 *
 * @param start         start point of the segment
 * @param dir           vector from start to the segment end (end = start + dir)
 * @param face          face whose texture coordinate / normal / binormal arrays
 *                      are sampled when a hit is found
 * @param closest_t     in/out: only hits with t below *closest_t are accepted,
 *                      and the accepted t is written back
 * @param intersection  optional out: hit position (may be NULL)
 * @param tex_coord     optional out: interpolated texture coordinate (may be NULL)
 * @param normal        optional out: interpolated normal (may be NULL)
 * @param bi_normal     optional out: interpolated binormal (may be NULL)
 */
LLOctreeTriangleRayIntersect::LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
	const LLVolumeFace* face, F32* closest_t,
	LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
	: mFace(face)
	, mStart(start)
	, mDir(dir)
	, mIntersection(intersection)
	, mTexCoord(tex_coord)
	, mNormal(normal)
	, mBinormal(bi_normal)
	, mClosestT(closest_t)
	, mHitFace(false)
{
	// The end point is derived from the two inputs rather than passed in.
	mEnd.setAdd(mStart, mDir);
}
|
||||
|
||||
void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>* node)
|
||||
{
|
||||
LLVolumeOctreeListener* vl = (LLVolumeOctreeListener*) node->getListener(0);
|
||||
|
||||
/*const F32* start = mStart.getF32();
|
||||
const F32* end = mEnd.getF32();
|
||||
const F32* center = vl->mBounds[0].getF32();
|
||||
const F32* size = vl->mBounds[1].getF32();*/
|
||||
|
||||
//if (LLLineSegmentBoxIntersect(mStart, mEnd, vl->mBounds[0], vl->mBounds[1]))
|
||||
if (LLLineSegmentBoxIntersect(mStart.getF32ptr(), mEnd.getF32ptr(), vl->mBounds[0].getF32ptr(), vl->mBounds[1].getF32ptr()))
|
||||
{
|
||||
node->accept(this);
|
||||
for (S32 i = 0; i < node->getChildCount(); ++i)
|
||||
{
|
||||
traverse(node->getChild(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* node)
|
||||
{
|
||||
for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter =
|
||||
node->getData().begin(); iter != node->getData().end(); ++iter)
|
||||
{
|
||||
const LLVolumeTriangle* tri = *iter;
|
||||
|
||||
F32 a, b, t;
|
||||
|
||||
if (LLTriangleRayIntersect(*tri->mV[0], *tri->mV[1], *tri->mV[2],
|
||||
mStart, mDir, a, b, t))
|
||||
{
|
||||
if ((t >= 0.f) && // if hit is after start
|
||||
(t <= 1.f) && // and before end
|
||||
(t < *mClosestT)) // and this hit is closer
|
||||
{
|
||||
*mClosestT = t;
|
||||
mHitFace = true;
|
||||
|
||||
if (mIntersection != NULL)
|
||||
{
|
||||
LLVector4a intersect = mDir;
|
||||
intersect.mul(*mClosestT);
|
||||
intersect.add(mStart);
|
||||
mIntersection->set(intersect.getF32ptr());
|
||||
}
|
||||
|
||||
|
||||
if (mTexCoord != NULL)
|
||||
{
|
||||
LLVector2* tc = (LLVector2*) mFace->mTexCoords;
|
||||
*mTexCoord = ((1.f - a - b) * tc[tri->mIndex[0]] +
|
||||
a * tc[tri->mIndex[1]] +
|
||||
b * tc[tri->mIndex[2]]);
|
||||
|
||||
}
|
||||
|
||||
if (mNormal != NULL)
|
||||
{
|
||||
LLVector4* norm = (LLVector4*) mFace->mNormals;
|
||||
|
||||
*mNormal = ((1.f - a - b) * LLVector3(norm[tri->mIndex[0]]) +
|
||||
a * LLVector3(norm[tri->mIndex[1]]) +
|
||||
b * LLVector3(norm[tri->mIndex[2]]));
|
||||
}
|
||||
|
||||
if (mBinormal != NULL)
|
||||
{
|
||||
LLVector4* binormal = (LLVector4*) mFace->mBinormals;
|
||||
*mBinormal = ((1.f - a - b) * LLVector3(binormal[tri->mIndex[0]]) +
|
||||
a * LLVector3(binormal[tri->mIndex[1]]) +
|
||||
b * LLVector3(binormal[tri->mIndex[2]]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accessor for the triangle's mPositionGroup vector.
const LLVector4a& LLVolumeTriangle::getPositionGroup() const
{
	return mPositionGroup;
}
|
||||
|
||||
// Accessor for the triangle's mRadius value.
const F32& LLVolumeTriangle::getBinRadius() const
{
	return mRadius;
}
|
||||
|
||||
|
||||
//TEST CODE
|
||||
|
||||
void LLVolumeOctreeValidate::visit(const LLOctreeNode<LLVolumeTriangle>* branch)
|
||||
{
|
||||
LLVolumeOctreeListener* node = (LLVolumeOctreeListener*) branch->getListener(0);
|
||||
|
||||
//make sure bounds matches extents
|
||||
LLVector4a& min = node->mExtents[0];
|
||||
LLVector4a& max = node->mExtents[1];
|
||||
|
||||
LLVector4a& center = node->mBounds[0];
|
||||
LLVector4a& size = node->mBounds[1];
|
||||
|
||||
LLVector4a test_min, test_max;
|
||||
test_min.setSub(center, size);
|
||||
test_max.setAdd(center, size);
|
||||
|
||||
if (!test_min.equals3(min, 0.001f) ||
|
||||
!test_max.equals3(max, 0.001f))
|
||||
{
|
||||
llerrs << "Bad bounding box data found." << llendl;
|
||||
}
|
||||
|
||||
test_min.sub(LLVector4a(0.001f));
|
||||
test_max.add(LLVector4a(0.001f));
|
||||
|
||||
for (U32 i = 0; i < branch->getChildCount(); ++i)
|
||||
{
|
||||
LLVolumeOctreeListener* child = (LLVolumeOctreeListener*) branch->getChild(i)->getListener(0);
|
||||
|
||||
//make sure all children fit inside this node
|
||||
if (child->mExtents[0].lessThan(test_min).areAnySet(LLVector4Logical::MASK_XYZ) ||
|
||||
child->mExtents[1].greaterThan(test_max).areAnySet(LLVector4Logical::MASK_XYZ))
|
||||
{
|
||||
llerrs << "Child protrudes from bounding box." << llendl;
|
||||
}
|
||||
}
|
||||
|
||||
//children fit, check data
|
||||
for (LLOctreeNode<LLVolumeTriangle>::const_element_iter iter = branch->getData().begin();
|
||||
iter != branch->getData().end(); ++iter)
|
||||
{
|
||||
const LLVolumeTriangle* tri = *iter;
|
||||
|
||||
//validate triangle
|
||||
for (U32 i = 0; i < 3; i++)
|
||||
{
|
||||
if (tri->mV[i]->greaterThan(test_max).areAnySet(LLVector4Logical::MASK_XYZ) ||
|
||||
tri->mV[i]->lessThan(test_min).areAnySet(LLVector4Logical::MASK_XYZ))
|
||||
{
|
||||
llerrs << "Triangle protrudes from node." << llendl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
135
indra/llmath/llvolumeoctree.h
Normal file
135
indra/llmath/llvolumeoctree.h
Normal file
@@ -0,0 +1,135 @@
|
||||
/**
|
||||
* @file llvolumeoctree.h
|
||||
* @brief LLVolume octree classes.
|
||||
*
|
||||
* $LicenseInfo:firstyear=2002&license=viewerlgpl$
|
||||
* Second Life Viewer Source Code
|
||||
* Copyright (C) 2010, Linden Research, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation;
|
||||
* version 2.1 of the License only.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
||||
* $/LicenseInfo$
|
||||
*/
|
||||
|
||||
#ifndef LL_LLVOLUME_OCTREE_H
|
||||
#define LL_LLVOLUME_OCTREE_H
|
||||
|
||||
#include "linden_common.h"
|
||||
#include "llmemory.h"
|
||||
|
||||
#include "lloctree.h"
|
||||
#include "llvolume.h"
|
||||
#include "llvector4a.h"
|
||||
|
||||
#if 0 //MESH
|
||||
class LLVolumeTriangle : public LLRefCount
|
||||
{
|
||||
public:
|
||||
LLVolumeTriangle()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
LLVolumeTriangle(const LLVolumeTriangle& rhs)
|
||||
{
|
||||
*this = rhs;
|
||||
}
|
||||
|
||||
const LLVolumeTriangle& operator=(const LLVolumeTriangle& rhs)
|
||||
{
|
||||
llerrs << "Illegal operation!" << llendl;
|
||||
return *this;
|
||||
}
|
||||
|
||||
~LLVolumeTriangle()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
LLVector4a mPositionGroup;
|
||||
|
||||
const LLVector4a* mV[3];
|
||||
U16 mIndex[3];
|
||||
|
||||
F32 mRadius;
|
||||
|
||||
virtual const LLVector4a& getPositionGroup() const;
|
||||
virtual const F32& getBinRadius() const;
|
||||
};
|
||||
|
||||
// Octree listener that carries the bounding data of one volume octree node.
class LLVolumeOctreeListener : public LLOctreeListener<LLVolumeTriangle>
{
public:
	LLVolumeOctreeListener(LLOctreeNode<LLVolumeTriangle>* node);
	~LLVolumeOctreeListener();

	// Copying is not supported: this forwards to operator=, which reports an
	// error through llerrs.
	LLVolumeOctreeListener(const LLVolumeOctreeListener& rhs)
	{
		*this = rhs;
	}

	// Assignment is not supported; nothing is copied.
	const LLVolumeOctreeListener& operator=(const LLVolumeOctreeListener& rhs)
	{
		llerrs << "Illegal operation!" << llendl;
		return *this;
	}

	//LISTENER FUNCTIONS
	// Only child addition is implemented out of line; every other notification
	// is an empty inline body.
	virtual void handleChildAddition(const LLOctreeNode<LLVolumeTriangle>* parent,
		LLOctreeNode<LLVolumeTriangle>* child);
	virtual void handleStateChange(const LLTreeNode<LLVolumeTriangle>* node) { }
	virtual void handleChildRemoval(const LLOctreeNode<LLVolumeTriangle>* parent,
		const LLOctreeNode<LLVolumeTriangle>* child) { }
	virtual void handleInsertion(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
	virtual void handleRemoval(const LLTreeNode<LLVolumeTriangle>* node, LLVolumeTriangle* tri) { }
	virtual void handleDestruction(const LLTreeNode<LLVolumeTriangle>* node) { }

public:
	// bounding box (center, size) of this node and all its children (tight fit to objects)
	LLVector4a mBounds[2];

	// extents (min, max) of this node and all its children
	LLVector4a mExtents[2];
};
|
||||
|
||||
// Octree traveler that intersects a line segment with the triangles of one
// volume face and records the closest hit.
class LLOctreeTriangleRayIntersect : public LLOctreeTraveler<LLVolumeTriangle>
{
public:
	// Face whose per-vertex attribute arrays are sampled on a hit.
	const LLVolumeFace* mFace;

	// Query segment: start point, start-to-end vector, and end point.
	LLVector4a mStart;
	LLVector4a mDir;
	LLVector4a mEnd;

	// Optional outputs; each may be NULL when the caller does not want it.
	LLVector3* mIntersection;
	LLVector2* mTexCoord;
	LLVector3* mNormal;
	LLVector3* mBinormal;

	// In/out: parameter t of the closest hit accepted so far.
	F32* mClosestT;

	// Starts out false and is set to true once a hit has been accepted.
	bool mHitFace;

	LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
		const LLVolumeFace* face, F32* closest_t,
		LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal);

	// Recursive walk that skips nodes whose bounds the segment misses.
	void traverse(const LLOctreeNode<LLVolumeTriangle>* node);

	// Tests the triangles stored in a single node.
	virtual void visit(const LLOctreeNode<LLVolumeTriangle>* node);
};
|
||||
|
||||
// Test-only octree traveler: visit() checks one node's bounds, children and
// triangles for consistency.
class LLVolumeOctreeValidate : public LLOctreeTraveler<LLVolumeTriangle>
{
	// Declared without an access specifier, so it is private in this class.
	virtual void visit(const LLOctreeNode<LLVolumeTriangle>* branch);
};
|
||||
#endif //0
|
||||
#endif
|
||||
Reference in New Issue
Block a user