Unstaged changes cleanup. Further vectorization. Change in binormal/bitangent calculation.

This commit is contained in:
Shyotl
2013-10-09 14:47:06 -05:00
parent b473661cf4
commit f25eb07fab
51 changed files with 1987 additions and 1895 deletions

View File

@@ -60,7 +60,7 @@ inline void LLMatrix3a::setTranspose(const LLMatrix3a& src)
const LLQuad srcCol1 = src.mColumns[1];
const LLQuad unpacklo = _mm_unpacklo_ps( srcCol0, srcCol1 );
mColumns[0] = _mm_movelh_ps( unpacklo, src.mColumns[2] );
mColumns[1] = _mm_shuffle_ps( unpacklo, src.mColumns[2], _MM_SHUFFLE(0, 1, 3, 2) );
mColumns[1] = _mm_shuffle_ps( _mm_movehl_ps( srcCol0, unpacklo ), src.mColumns[2], _MM_SHUFFLE(0, 1, 1, 0) );
mColumns[2] = _mm_shuffle_ps( _mm_unpackhi_ps( srcCol0, srcCol1 ), src.mColumns[2], _MM_SHUFFLE(0, 2, 1, 0) );
}

View File

@@ -39,34 +39,6 @@
#include <stdint.h>
#endif
// Round a pointer up to the nearest 16-byte boundary.
// A pointer that already sits on a 16-byte boundary is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS(T* address)
{
	const uintptr_t low_bits = 0xF;
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	// Bump past the boundary, then clear the low four bits.
	const uintptr_t rounded = (raw + low_bits) & ~low_bits;
	return reinterpret_cast<T*>(rounded);
}
// Round a pointer up to the nearest 64-byte boundary.
// A pointer that already sits on a 64-byte boundary is returned unchanged.
template <typename T> T* LL_NEXT_ALIGNED_ADDRESS_64(T* address)
{
	const uintptr_t low_bits = 0x3F;
	const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
	// Bump past the boundary, then clear the low six bits.
	const uintptr_t rounded = (raw + low_bits) & ~low_bits;
	return reinterpret_cast<T*>(rounded);
}
// Alignment helpers. The alignment request is spelled differently, and placed
// on a different side of the declaration, depending on the platform macro that
// is set. A declaration is therefore wrapped with both a prefix and a postfix
// macro, one of which always expands to nothing.
#if LL_LINUX || LL_DARWIN
// Linux/Darwin: __attribute__((aligned(x))) is appended after the declaration.
#define LL_ALIGN_PREFIX(x)
#define LL_ALIGN_POSTFIX(x) __attribute__((aligned(x)))
#elif LL_WINDOWS
// Windows: __declspec(align(x)) is placed in front of the declaration.
#define LL_ALIGN_PREFIX(x) __declspec(align(x))
#define LL_ALIGN_POSTFIX(x)
#else
// No known spelling for this platform: fail the build instead of silently
// dropping the alignment request.
#error "LL_ALIGN_PREFIX and LL_ALIGN_POSTFIX undefined"
#endif
// Wrap the declaration `var` so that it is requested to be 16-byte aligned.
#define LL_ALIGN_16(var) LL_ALIGN_PREFIX(16) var LL_ALIGN_POSTFIX(16)
#include <xmmintrin.h>
#include <emmintrin.h>

View File

@@ -41,55 +41,7 @@ extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F
// Copy `bytes` bytes of F32 data from src to dst.
//
// Preconditions (checked with asserts):
//   - src and dst are non-NULL and 16-byte aligned,
//   - bytes is greater than zero and a multiple of 16.
// The __restrict qualifiers additionally promise that the two ranges do not
// overlap.
//
// The copy is handed to ll_memcpy_nonaliased_aligned_16 exactly once, with the
// untouched dst/src pointers. dst and src must not be advanced before that
// call: it receives the full byte count, so starting it from moved pointers
// would run past the end of both buffers.
/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
{
	assert(src != NULL);
	assert(dst != NULL);
	assert(bytes > 0);
	assert((bytes % sizeof(F32))== 0);
	ll_assert_aligned(src,16);
	ll_assert_aligned(dst,16);
	assert(bytes%16==0);

	ll_memcpy_nonaliased_aligned_16((char*)dst, (char*)src, bytes);
}
void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )

View File

@@ -46,6 +46,7 @@ class LLRotation;
// of this writing, July 08, 2010) about getting it implemented before you resort to
// LLVector3/LLVector4.
/////////////////////////////////
class LLVector4a;
LL_ALIGN_PREFIX(16)
class LLVector4a
@@ -235,6 +236,11 @@ public:
// Note that this does not consider zero length vectors!
inline void normalize3fast();
// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
// Same as above except substitutes default vector contents if the vector is non-finite or degenerate due to zero length.
//
inline void normalize3fast_checked(LLVector4a* d = 0);
// Return true if this vector is normalized with respect to x,y,z up to tolerance
inline LLBool32 isNormalized3( F32 tolerance = 1e-3 ) const;

View File

@@ -409,6 +409,26 @@ inline void LLVector4a::normalize3fast()
mQ = _mm_mul_ps( mQ, approxRsqrt );
}
// Approximate normalization with a fallback for vectors that cannot be
// normalized.
//
// If isFinite3() fails, or the x/y/z dot product of this vector with itself is
// at most FLT_EPSILON, this vector is replaced by *d when d is non-null and by
// (0,1,0,1) otherwise. In every other case all four lanes of mQ are multiplied
// by the _mm_rsqrt_ps approximation of the reciprocal length.
inline void LLVector4a::normalize3fast_checked(LLVector4a* d)
{
	LLVector4a dot3;
	bool unusable = !isFinite3();
	if (!unusable)
	{
		// Only evaluate the squared length once the components are known to be finite.
		dot3.setAllDot3( *this, *this );
		unusable = dot3.getF32ptr()[0] <= FLT_EPSILON;
	}

	if (unusable)
	{
		if (d)
		{
			*this = *d;
		}
		else
		{
			*this = LLVector4a(0,1,0,1);
		}
		return;
	}

	mQ = _mm_mul_ps( mQ, _mm_rsqrt_ps(dot3.mQ) );
}
// Return true if this vector is normalized with respect to x,y,z up to tolerance
inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
{
@@ -460,21 +480,19 @@ inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
}
// Set this to lhs + (rhs-lhs)*c
//
// lhs - value weighted by (1 - c)
// rhs - value weighted by c
// c   - interpolation factor
inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
{
	// The scaled delta is built in a temporary; *this is written only by the
	// final setAdd, after lhs and rhs have been read for the subtraction.
	LLVector4a t;
	t.setSub(rhs,lhs);
	t.mul(c);
	setAdd(lhs, t);
}
inline LLBool32 LLVector4a::isFinite3() const
{
static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
ll_assert_aligned(nanOrInfMask,16);
const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));

File diff suppressed because it is too large Load Diff

View File

@@ -37,7 +37,6 @@ class LLPath;
template <class T> class LLOctreeNode;
class LLVector4a;
class LLVolumeFace;
class LLVolume;
class LLVolumeTriangle;
@@ -50,12 +49,15 @@ class LLVolumeTriangle;
#include "v3math.h"
#include "v3dmath.h"
#include "v4math.h"
#include "llvector4a.h"
#include "llmatrix4a.h"
#include "llquaternion.h"
#include "llstrider.h"
#include "v4coloru.h"
#include "llrefcount.h"
#include "llpointer.h"
#include "llfile.h"
#include "llalignedarray.h"
//============================================================================
@@ -709,16 +711,16 @@ public:
LLFaceID mFaceID;
};
std::vector<LLVector3> mProfile;
std::vector<LLVector2> mNormals;
LLAlignedArray<LLVector4a, 64> mProfile;
//LLAlignedArray<LLVector4a, 64> mNormals;
std::vector<Face> mFaces;
std::vector<LLVector3> mEdgeNormals;
std::vector<LLVector3> mEdgeCenters;
//LLAlignedArray<LLVector4a, 64> mEdgeNormals;
//LLAlignedArray<LLVector4a, 64> mEdgeCenters;
friend std::ostream& operator<<(std::ostream &s, const LLProfile &profile);
protected:
void genNormals(const LLProfileParams& params);
static S32 getNumNGonPoints(const LLProfileParams& params, S32 sides, F32 offset=0.0f, F32 bevel = 0.0f, F32 ang_scale = 1.f, S32 split = 0);
void genNGon(const LLProfileParams& params, S32 sides, F32 offset=0.0f, F32 bevel = 0.0f, F32 ang_scale = 1.f, S32 split = 0);
@@ -742,13 +744,29 @@ protected:
class LLPath
{
public:
// One point along an LLPath: a rotation matrix, a position, a scale and a
// texture T value. All members are public.
class PathPt
{
public:
	LLMatrix4a mRot;
	LLVector4a mPos;
	LLVector4a mScale;
	F32 mTexT;
	F32 pad[3]; //for alignment

	// Default state: cleared position and scale, mTexT of 0, and a rotation
	// whose three rows are the unit x, y and z axes.
	PathPt()
	{
		mPos.clear();
		mTexT = 0;
		mScale.clear();
		mRot.setRows(LLVector4a(1,0,0,0),
					 LLVector4a(0,1,0,0),
					 LLVector4a(0,0,1,0));

		//distinguished data in the pad for debugging
		pad[0] = 3.14159f;
		pad[1] = -3.14159f;
		pad[2] = 0.585f;
	}
};
public:
@@ -780,7 +798,7 @@ public:
friend std::ostream& operator<<(std::ostream &s, const LLPath &path);
public:
std::vector<PathPt> mPath;
LLAlignedArray<PathPt, 64> mPath;
protected:
BOOL mOpen;
@@ -845,12 +863,12 @@ private:
public:
BOOL create(LLVolume* volume, BOOL partial_build = FALSE);
void createBinormals();
void createTangents();
void appendFace(const LLVolumeFace& face, LLMatrix4& transform, LLMatrix4& normal_tranform);
void resizeVertices(S32 num_verts);
void allocateBinormals(S32 num_verts);
void allocateTangents(S32 num_verts);
void allocateWeights(S32 num_verts);
void resizeIndices(S32 num_indices);
void fillFromLegacyData(std::vector<LLVolumeFace::VertexData>& v, std::vector<U16>& idx);
@@ -913,11 +931,12 @@ public:
LLVector2 mTexCoordExtents[2]; //minimum and maximum of texture coordinates of the face.
S32 mNumVertices;
S32 mNumAllocatedVertices;
S32 mNumIndices;
LLVector4a* mPositions;
LLVector4a* mNormals;
LLVector4a* mBinormals;
LLVector4a* mTangents;
LLVector2* mTexCoords;
U16* mIndices;
@@ -931,9 +950,12 @@ public:
// format is mWeights[vertex_index].mV[influence] = <joint_index>.<weight>
// mWeights.size() should be empty or match mVertices.size()
LLVector4a* mWeights;
LLOctreeNode<LLVolumeTriangle>* mOctree;
//whether or not face has been cache optimized
BOOL mOptimized;
private:
BOOL createUnCutCubeCap(LLVolume* volume, BOOL partial_build = FALSE);
BOOL createCap(LLVolume* volume, BOOL partial_build = FALSE);
@@ -945,15 +967,10 @@ class LLVolume : public LLRefCount
friend class LLVolumeLODGroup;
protected:
LLVolume(const LLVolume&); // Don't implement
~LLVolume(); // use unref
public:
struct Point
{
LLVector3 mPos;
};
struct FaceParams
{
LLFaceID mFaceID;
@@ -976,13 +993,13 @@ public:
const LLProfile& getProfile() const { return *mProfilep; }
LLPath& getPath() const { return *mPathp; }
void resizePath(S32 length);
const std::vector<Point>& getMesh() const { return mMesh; }
const LLVector3& getMeshPt(const U32 i) const { return mMesh[i].mPos; }
const LLAlignedArray<LLVector4a,64>& getMesh() const { return mMesh; }
const LLVector4a& getMeshPt(const U32 i) const { return mMesh[i]; }
void setDirty() { mPathp->setDirty(); mProfilep->setDirty(); }
void regen();
void genBinormals(S32 face);
void genTangents(S32 face);
BOOL isConvex() const;
BOOL isCap(S32 face);
@@ -992,10 +1009,7 @@ public:
S32 getSculptLevel() const { return mSculptLevel; }
void setSculptLevel(S32 level) { mSculptLevel = level; }
S32 *getTriangleIndices(U32 &num_indices) const;
// returns number of triangle indices required for path/profile mesh
S32 getNumTriangleIndices() const;
static void getLoDTriangleCounts(const LLVolumeParams& params, S32* counts);
S32 getNumTriangles(S32* vcount = NULL) const;
@@ -1010,21 +1024,14 @@ public:
//get the face index of the face that intersects with the given line segment at the point
//closest to start. Moves end to the point of intersection. Returns -1 if no intersection.
//Line segment must be in volume space.
S32 lineSegmentIntersect(const LLVector3& start, const LLVector3& end,
S32 lineSegmentIntersect(const LLVector4a& start, const LLVector4a& end,
S32 face = -1, // which face to check, -1 = ALL_SIDES
LLVector3* intersection = NULL, // return the intersection point
LLVector4a* intersection = NULL, // return the intersection point
LLVector2* tex_coord = NULL, // return the texture coordinates of the intersection point
LLVector3* normal = NULL, // return the surface normal at the intersection point
LLVector3* bi_normal = NULL // return the surface bi-normal at the intersection point
LLVector4a* normal = NULL, // return the surface normal at the intersection point
LLVector4a* tangent = NULL // return the surface tangent at the intersection point
);
S32 lineSegmentIntersect(const LLVector4a& start, const LLVector4a& end,
S32 face = 1,
LLVector3* intersection = NULL,
LLVector2* tex_coord = NULL,
LLVector3* normal = NULL,
LLVector3* bi_normal = NULL);
LLFaceID generateFaceMask();
BOOL isFaceMaskValid(LLFaceID face_mask);
@@ -1068,7 +1075,8 @@ public:
LLVolumeParams mParams;
LLPath *mPathp;
LLProfile *mProfilep;
std::vector<Point> mMesh;
LLAlignedArray<LLVector4a,64> mMesh;
BOOL mGenerateSingleFace;
typedef std::vector<LLVolumeFace> face_list_t;
@@ -1083,21 +1091,12 @@ public:
std::ostream& operator<<(std::ostream &s, const LLVolumeParams &volume_params);
void calc_binormal_from_triangle(
LLVector4a& binormal,
const LLVector4a& pos0,
const LLVector2& tex0,
const LLVector4a& pos1,
const LLVector2& tex1,
const LLVector4a& pos2,
const LLVector2& tex2);
BOOL LLLineSegmentBoxIntersect(const F32* start, const F32* end, const F32* center, const F32* size);
BOOL LLLineSegmentBoxIntersect(const LLVector3& start, const LLVector3& end, const LLVector3& center, const LLVector3& size);
BOOL LLLineSegmentBoxIntersect(const LLVector4a& start, const LLVector4a& end, const LLVector4a& center, const LLVector4a& size);
BOOL LLTriangleRayIntersect(const LLVector3& vert0, const LLVector3& vert1, const LLVector3& vert2, const LLVector3& orig, const LLVector3& dir,
F32& intersection_a, F32& intersection_b, F32& intersection_t, BOOL two_sided);
//BOOL LLTriangleRayIntersect(const LLVector3& vert0, const LLVector3& vert1, const LLVector3& vert2, const LLVector3& orig, const LLVector3& dir,
// F32& intersection_a, F32& intersection_b, F32& intersection_t, BOOL two_sided);
BOOL LLTriangleRayIntersect(const LLVector4a& vert0, const LLVector4a& vert1, const LLVector4a& vert2, const LLVector4a& orig, const LLVector4a& dir,
F32& intersection_a, F32& intersection_b, F32& intersection_t);

View File

@@ -94,14 +94,14 @@ void LLVolumeOctreeListener::handleChildAddition(const LLOctreeNode<LLVolumeTria
LLOctreeTriangleRayIntersect::LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
const LLVolumeFace* face, F32* closest_t,
LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal)
LLVector4a* intersection,LLVector2* tex_coord, LLVector4a* normal, LLVector4a* tangent)
: mFace(face),
mStart(start),
mDir(dir),
mIntersection(intersection),
mTexCoord(tex_coord),
mNormal(normal),
mBinormal(bi_normal),
mTangent(tangent),
mClosestT(closest_t),
mHitFace(false)
{
@@ -112,13 +112,7 @@ void LLOctreeTriangleRayIntersect::traverse(const LLOctreeNode<LLVolumeTriangle>
{
LLVolumeOctreeListener* vl = (LLVolumeOctreeListener*) node->getListener(0);
/*const F32* start = mStart.getF32();
const F32* end = mEnd.getF32();
const F32* center = vl->mBounds[0].getF32();
const F32* size = vl->mBounds[1].getF32();*/
//if (LLLineSegmentBoxIntersect(mStart, mEnd, vl->mBounds[0], vl->mBounds[1]))
if (LLLineSegmentBoxIntersect(mStart.getF32ptr(), mEnd.getF32ptr(), vl->mBounds[0].getF32ptr(), vl->mBounds[1].getF32ptr()))
if (LLLineSegmentBoxIntersect(mStart, mEnd, vl->mBounds[0], vl->mBounds[1]))
{
node->accept(this);
for (U32 i = 0; i < node->getChildCount(); ++i)
@@ -152,34 +146,60 @@ void LLOctreeTriangleRayIntersect::visit(const LLOctreeNode<LLVolumeTriangle>* n
LLVector4a intersect = mDir;
intersect.mul(*mClosestT);
intersect.add(mStart);
mIntersection->set(intersect.getF32ptr());
*mIntersection = intersect;
}
U32 idx0 = tri->mIndex[0];
U32 idx1 = tri->mIndex[1];
U32 idx2 = tri->mIndex[2];
if (mTexCoord != NULL)
{
LLVector2* tc = (LLVector2*) mFace->mTexCoords;
*mTexCoord = ((1.f - a - b) * tc[tri->mIndex[0]] +
a * tc[tri->mIndex[1]] +
b * tc[tri->mIndex[2]]);
*mTexCoord = ((1.f - a - b) * tc[idx0] +
a * tc[idx1] +
b * tc[idx2]);
}
if (mNormal != NULL)
{
LLVector4* norm = (LLVector4*) mFace->mNormals;
LLVector4a* norm = mFace->mNormals;
LLVector4a n1,n2,n3;
n1 = norm[idx0];
n1.mul(1.f-a-b);
n2 = norm[idx1];
n2.mul(a);
n3 = norm[idx2];
n3.mul(b);
*mNormal = ((1.f - a - b) * LLVector3(norm[tri->mIndex[0]]) +
a * LLVector3(norm[tri->mIndex[1]]) +
b * LLVector3(norm[tri->mIndex[2]]));
n1.add(n2);
n1.add(n3);
*mNormal = n1;
}
if (mBinormal != NULL)
if (mTangent != NULL)
{
LLVector4* binormal = (LLVector4*) mFace->mBinormals;
*mBinormal = ((1.f - a - b) * LLVector3(binormal[tri->mIndex[0]]) +
a * LLVector3(binormal[tri->mIndex[1]]) +
b * LLVector3(binormal[tri->mIndex[2]]));
LLVector4a* tangents = mFace->mTangents;
LLVector4a t1,t2,t3;
t1 = tangents[idx0];
t1.mul(1.f-a-b);
t2 = tangents[idx1];
t2.mul(a);
t3 = tangents[idx2];
t3.mul(b);
t1.add(t2);
t1.add(t3);
*mTangent = t1;
}
}
}

View File

@@ -134,16 +134,16 @@ public:
LLVector4a mStart;
LLVector4a mDir;
LLVector4a mEnd;
LLVector3* mIntersection;
LLVector4a* mIntersection;
LLVector2* mTexCoord;
LLVector3* mNormal;
LLVector3* mBinormal;
LLVector4a* mNormal;
LLVector4a* mTangent;
F32* mClosestT;
bool mHitFace;
LLOctreeTriangleRayIntersect(const LLVector4a& start, const LLVector4a& dir,
const LLVolumeFace* face, F32* closest_t,
LLVector3* intersection,LLVector2* tex_coord, LLVector3* normal, LLVector3* bi_normal);
LLVector4a* intersection,LLVector2* tex_coord, LLVector4a* normal, LLVector4a* tangent);
void traverse(const LLOctreeNode<LLVolumeTriangle>* node);