345 lines
12 KiB
C++
345 lines
12 KiB
C++
/**
|
|
* @file llvector4a.h
|
|
* @brief LLVector4a class header file - memory aligned and vectorized 4 component vector
|
|
*
|
|
* $LicenseInfo:firstyear=2010&license=viewerlgpl$
|
|
* Second Life Viewer Source Code
|
|
* Copyright (C) 2010, Linden Research, Inc.
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation;
|
|
* version 2.1 of the License only.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*
|
|
* Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
|
|
* $/LicenseInfo$
|
|
*/
|
|
|
|
#ifndef LL_LLVECTOR4A_H
|
|
#define LL_LLVECTOR4A_H
|
|
|
|
|
|
class LLRotation;
|
|
|
|
#include <assert.h>
|
|
#include "llpreprocessor.h"
|
|
#include "llmemory.h"
|
|
|
|
///////////////////////////////////
|
|
// FIRST TIME USERS PLEASE READ
|
|
//////////////////////////////////
|
|
// This is just the beginning of LLVector4a. There are many more useful functions
|
|
// yet to be implemented. For example, setNeg to negate a vector, rotate() to apply
|
|
// a matrix rotation, various functions to manipulate only the X, Y, and Z elements
|
|
// and many others (including a whole variety of accessors). So if you don't see a
|
|
// function here that you need, please contact Falcon or someone else with SSE
|
|
// experience (Richard, I think, has some and davep has a little as of the time
|
|
// of this writing, July 08, 2010) about getting it implemented before you resort to
|
|
// LLVector3/LLVector4.
|
|
/////////////////////////////////
|
|
class LLVector4a;
|
|
|
|
LL_ALIGN_PREFIX(16)
class LLVector4a
{
public:

    ///////////////////////////////////
    // STATIC METHODS
    ///////////////////////////////////

    // Startup hook: turns on flush-to-zero and selects round-to-nearest in the
    // SSE control state. Call it once at startup; otherwise denormalized numbers
    // can cost 15,000+ cycles each.
    static void initClass()
    {
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    }

    // Shared constant vector with every element zero.
    // The object itself (LL_V4A_ZERO) is defined outside this header.
    static inline const LLVector4a& getZero()
    {
        extern const LLVector4a LL_V4A_ZERO;
        return LL_V4A_ZERO;
    }

    // Shared constant vector with every element set to epsilon, a small float
    // suitable for approximate equality checks.
    // The object itself (LL_V4A_EPSILON) is defined outside this header.
    static inline const LLVector4a& getEpsilon()
    {
        extern const LLVector4a LL_V4A_EPSILON;
        return LL_V4A_EPSILON;
    }

    // Copy one 16-byte block (four floats) from src to dst using aligned
    // load/store. Both pointers must be 16-byte aligned.
    static inline void copy4a(F32* dst, const F32* src)
    {
        _mm_store_ps(dst, _mm_load_ps(src));
    }

    // Copy 'bytes' bytes from src to dst in 16-byte blocks.
    // Requirements: src and dst must not overlap, both must be 16-byte aligned,
    // and 'bytes' must be a multiple of 16.
    static void memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes);

    ////////////////////////////////////
    // CONSTRUCTORS
    ////////////////////////////////////

    // Default construction deliberately leaves the contents UNINITIALIZED;
    // the overhead of initializing is completely unnecessary. Only the
    // 16-byte alignment of the object is asserted.
    LLVector4a()
    {
        ll_assert_aligned(this,16);
    }

    // Construct from individual components; w defaults to 0.
    LLVector4a(F32 x, F32 y, F32 z, F32 w = 0.f)
    {
        this->set(x, y, z, w);
    }

    // Construct with all four elements set to x.
    LLVector4a(F32 x)
    {
        this->splat(x);
    }

    // Construct with all four elements set to the SIMD scalar x.
    LLVector4a(const LLSimdScalar& x)
    {
        this->splat(x);
    }

    // Construct directly from a raw quad.
    LLVector4a(LLQuad q)
        : mQ(q)
    {
    }

    ////////////////////////////////////
    // LOAD/STORE
    ////////////////////////////////////

    // Load four floats from a 16-byte aligned array (the preferred way to load)
    inline void load4a(const F32* src);

    // Load four floats from an unaligned array (NB: significantly slower than load4a)
    inline void loadua(const F32* src);

    // Load only the three floats starting at 'src'; w is supplied separately
    // (defaults to 0). Slowest of the load methods.
    inline void load3(const F32* src, const F32 w=0.f);

    // Store this vector to a 16-byte aligned memory address
    inline void store4a(F32* dst) const;

    ////////////////////////////////////
    // BASIC GET/SET
    ////////////////////////////////////

    // Expose "this" as an F32 pointer. Do not use unless you have a very good
    // reason. (Not sure? Ask Falcon)
    inline F32* getF32ptr();

    // Expose "this" as a const F32 pointer. Do not use unless you have a very
    // good reason. (Not sure? Ask Falcon)
    inline const F32* const getF32ptr() const;

    // Read-only access to a single float of this vector. Avoid using this close
    // to any call that manipulates the data at the whole-vector level, or you
    // will incur a substantial penalty. Consider the splat functions instead.
    inline F32 operator[](const S32 idx) const;

    // Preferred read-only access to a single element when the index is only
    // known at run time. Use the templated overload if it is known at compile time.
    inline LLSimdScalar getScalarAt(const S32 idx) const;

    // Preferred read-only access to a single element when the index N is known
    // at compile time.
    template <int N> LL_FORCE_INLINE LLSimdScalar getScalarAt() const;

    // Assign x, y, z and (optionally) w; w defaults to 0
    inline void set(F32 x, F32 y, F32 z, F32 w = 0.f);

    // Zero every element. Preferred over assigning from ::getZero()
    inline void clear();

    // Broadcast the float 'x' into all elements
    inline void splat(const F32 x);

    // Broadcast the SIMD scalar 'x' into all elements
    inline void splat(const LLSimdScalar& x);

    // Broadcast element N of src into all 4 elements, with N known at compile time
    template <int N> void splat(const LLVector4a& src);

    // Broadcast element i of v into all 4 elements, with i NOT known at compile time
    inline void splat(const LLVector4a& v, U32 i);

    // Overwrite element N of this vector with element N of src. Much cleaner than
    // building an LLVector4Logical mask by hand, i.e.
    // {LLVector4Logical mask; mask.clear(); mask.setElement<N>(); target.setSelectWithMask(mask,src,target);}
    template <int N> inline void copyComponent(const LLVector4a& src);

    // Bitwise select: take bits from sourceIfTrue or sourceIfFalse according to the bits in mask
    inline void setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse );

    ////////////////////////////////////
    // ALGEBRAIC
    ////////////////////////////////////

    // this = a + b, element-wise
    inline void setAdd(const LLVector4a& a, const LLVector4a& b);

    // this = a - b, element-wise
    inline void setSub(const LLVector4a& a, const LLVector4a& b);

    // this = a * b, element-wise
    inline void setMul(const LLVector4a& a, const LLVector4a& b);

    // this = a / b, element-wise
    inline void setDiv(const LLVector4a& a, const LLVector4a& b);

    // this = |src|, element-wise
    inline void setAbs(const LLVector4a& src);

    // this += rhs, element-wise
    inline void add(const LLVector4a& rhs);

    // this -= rhs, element-wise
    inline void sub(const LLVector4a& rhs);

    // this *= rhs, element-wise
    inline void mul(const LLVector4a& rhs);

    // this /= rhs, element-wise
    inline void div(const LLVector4a& rhs);

    // Scale this vector by the scalar x
    inline void mul(const F32 x);

    // this = a x b (geometric cross product)
    inline void setCross3(const LLVector4a& a, const LLVector4a& b);

    // Fill all elements with the dot product of the x, y and z elements of a and b
    inline void setAllDot3(const LLVector4a& a, const LLVector4a& b);

    // Fill all elements with the dot product of the x, y, z and w elements of a and b
    inline void setAllDot4(const LLVector4a& a, const LLVector4a& b);

    // 3D dot product of this vector and b
    inline LLSimdScalar dot3(const LLVector4a& b) const;

    // 4D dot product of this vector and b
    inline LLSimdScalar dot4(const LLVector4a& b) const;

    // Normalize with respect to the x, y and z components only. Accurate to
    // 22 bits of precision. The W component is destroyed.
    // Note that zero-length vectors are NOT handled!
    inline void normalize3();

    // Same as normalize3(), but with respect to all 4 components
    inline void normalize4();

    // Same as normalize3(), but also returns the length as a SIMD scalar
    inline LLSimdScalar normalize3withLength();

    // Fast normalize with respect to the x, y and z components only. Accurate
    // only to 10-12 bits of precision. The W component is destroyed.
    // Note that zero-length vectors are NOT handled!
    inline void normalize3fast();

    // Fast normalize with respect to the x, y and z components only. Accurate
    // only to 10-12 bits of precision. The W component is destroyed.
    // Unlike normalize3fast(), default vector contents are substituted when the
    // vector is non-finite or degenerate due to zero length.
    // NOTE(review): 'd' presumably supplies those default contents -- confirm
    // against the definition, which is not in this header.
    inline void normalize3fast_checked(LLVector4a* d = 0);

    // True if this vector is normalized with respect to x, y, z, up to 'tolerance'
    inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const;

    // True if this vector is normalized with respect to all components, up to 'tolerance'
    inline LLBool32 isNormalized4( F32 tolerance = 1e-3 ) const;

    // Fill all elements with the length of vector 'v'
    inline void setAllLength3( const LLVector4a& v );

    // Length of this vector
    inline LLSimdScalar getLength3() const;

    // this = component-wise minimum of lhs and rhs
    inline void setMin(const LLVector4a& lhs, const LLVector4a& rhs);

    // this = component-wise maximum of lhs and rhs
    inline void setMax(const LLVector4a& lhs, const LLVector4a& rhs);

    // Clamp each component of this vector into the inclusive range [low, high]
    inline void clamp( const LLVector4a& low, const LLVector4a& high );

    // this = (c * lhs) + rhs * (1 - c)
    inline void setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c);

    // True (nonzero) if x, y, z (plus w for isFinite4) are all finite floats
    inline LLBool32 isFinite3() const;
    inline LLBool32 isFinite4() const;

    // this = 'vec' rotated by the given LLRotation or LLQuaternion2
    void setRotated( const LLRotation& rot, const LLVector4a& vec );
    void setRotated( const class LLQuaternion2& quat, const LLVector4a& vec );

    // this = 'vec' rotated by the INVERSE of the given LLRotation or LLQuaternion2
    inline void setRotatedInv( const LLRotation& rot, const LLVector4a& vec );
    inline void setRotatedInv( const class LLQuaternion2& quat, const LLVector4a& vec );

    // Quantize this vector to 8 or 16 bit precision
    void quantize8( const LLVector4a& low, const LLVector4a& high );
    void quantize16( const LLVector4a& low, const LLVector4a& high );

    // NOTE(review): undocumented in the original; by its name this presumably
    // negates this vector in place -- confirm against the definition.
    void negate();

    ////////////////////////////////////
    // LOGICAL
    ////////////////////////////////////
    // Each comparison in this section tests the elements of this vector against
    // those of rhs and returns an LLVector4Logical in which an element has all
    // bits set where the comparison was true and all bits clear where it was
    // false. See llvector4logical.h
    ////////////////////////////////////
    // WARNING: apart from equals3 and equals4, these functions do NOT account
    // for floating point tolerance. Build the appropriate tolerance into the
    // inputs yourself.
    ////////////////////////////////////

    inline LLVector4Logical greaterThan(const LLVector4a& rhs) const;

    inline LLVector4Logical lessThan(const LLVector4a& rhs) const;

    inline LLVector4Logical greaterEqual(const LLVector4a& rhs) const;

    inline LLVector4Logical lessEqual(const LLVector4a& rhs) const;

    inline LLVector4Logical equal(const LLVector4a& rhs) const;

    // True if this and rhs are component-wise equal (all four components) up to
    // the given absolute tolerance
    inline bool equals4(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;

    // Three-component counterpart of equals4
    inline bool equals3(const LLVector4a& rhs, F32 tolerance = F_APPROXIMATELY_ZERO ) const;

    ////////////////////////////////////
    // OPERATORS
    ////////////////////////////////////

    // Do NOT add additional operators without consulting someone with SSE experience
    inline const LLVector4a& operator= ( const LLVector4a& rhs );

    inline const LLVector4a& operator= ( const LLQuad& rhs );

    inline operator LLQuad() const;

private:
    // Underlying storage for the four components
    LLQuad mQ;
} LL_ALIGN_POSTFIX(16);
|
|
|
|
// Grow a running bounding pair so that it includes point p:
// 'min' becomes the component-wise minimum of itself and p, and
// 'max' becomes the component-wise maximum of itself and p.
inline void update_min_max(LLVector4a& min, LLVector4a& max, const LLVector4a& p)
{
    // Lower bound first, then upper bound (same order as always).
    min.setMin(min, p);
    max.setMax(max, p);
}
|
|
|
|
inline std::ostream& operator<<(std::ostream& s, const LLVector4a& v)
|
|
{
|
|
s << "(" << v[0] << ", " << v[1] << ", " << v[2] << ", " << v[3] << ")";
|
|
return s;
|
|
}
|
|
#endif
|