PageRenderTime 43ms CodeModel.GetById 30ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 1ms

/indra/llmath/llvector4a.inl

https://bitbucket.org/lindenlab/viewer-beta/
C++ Header | 593 lines | 351 code | 72 blank | 170 comment | 4 complexity | 35ac1670d8371ecb03f92369f2ddb24e MD5 | raw file
  1/** 
  2 * @file llvector4a.inl
  3 * @brief LLVector4a inline function implementations
  4 *
  5 * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6 * Second Life Viewer Source Code
  7 * Copyright (C) 2010, Linden Research, Inc.
  8 * 
  9 * This library is free software; you can redistribute it and/or
 10 * modify it under the terms of the GNU Lesser General Public
 11 * License as published by the Free Software Foundation;
 12 * version 2.1 of the License only.
 13 * 
 14 * This library is distributed in the hope that it will be useful,
 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 17 * Lesser General Public License for more details.
 18 * 
 19 * You should have received a copy of the GNU Lesser General Public
 20 * License along with this library; if not, write to the Free Software
 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 22 * 
 23 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 24 * $/LicenseInfo$
 25 */
 26
 27////////////////////////////////////
 28// LOAD/STORE
 29////////////////////////////////////
 30
 31// Load from 16-byte aligned src array (preferred method of loading)
 32inline void LLVector4a::load4a(const F32* src)
 33{
 34	mQ = _mm_load_ps(src);
 35}
 36
 37// Load from unaligned src array (NB: Significantly slower than load4a)
 38inline void LLVector4a::loadua(const F32* src)
 39{
 40	mQ = _mm_loadu_ps(src);
 41}
 42
 43// Load only three floats beginning at address 'src'. Slowest method.
 44inline void LLVector4a::load3(const F32* src)
 45{
 46	// mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
 47	// NB: This differs from the convention of { Z, Y, X, W }
 48	mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
 49}	
 50
 51// Store to a 16-byte aligned memory address
 52inline void LLVector4a::store4a(F32* dst) const
 53{
 54	_mm_store_ps(dst, mQ);
 55}
 56
 57////////////////////////////////////
 58// BASIC GET/SET 
 59////////////////////////////////////
 60
 61// Return a "this" as an F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
 62F32* LLVector4a::getF32ptr()
 63{
 64	return (F32*) &mQ;
 65}
 66
 67// Return a "this" as a const F32 pointer. Do not use unless you have a very good reason.  (Not sure? Ask Falcon)
 68const F32* const LLVector4a::getF32ptr() const
 69{
 70	return (const F32* const) &mQ;
 71}
 72
 73// Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
 74// the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
 75inline F32 LLVector4a::operator[](const S32 idx) const
 76{
 77	return ((F32*)&mQ)[idx];
 78}	
 79
 80// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
 81inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
 82{
 83	// Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
 84	switch (idx)
 85	{
 86		case 0:
 87			return mQ;
 88		case 1:
 89			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
 90		case 2:
 91			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
 92		case 3:
 93		default:
 94			return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
 95	}
 96}
 97
 98// Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
 99template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
100{
101	return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
102}
103
104template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
105{
106	return mQ;
107}
108
109// Set to an x, y, z and optional w provided
110inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
111{
112	mQ = _mm_set_ps(w, z, y, x);
113}
114
115// Set to all zeros
116inline void LLVector4a::clear()
117{
118	mQ = LLVector4a::getZero().mQ;
119}
120
121inline void LLVector4a::splat(const F32 x)
122{
123	mQ = _mm_set1_ps(x);	
124}
125
126inline void LLVector4a::splat(const LLSimdScalar& x)
127{
128	mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
129}
130
131// Set all 4 elements to element N of src, with N known at compile time
132template <int N> void LLVector4a::splat(const LLVector4a& src)
133{
134	mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
135}
136
137// Set all 4 elements to element i of v, with i NOT known at compile time
138inline void LLVector4a::splat(const LLVector4a& v, U32 i)
139{
140	switch (i)
141	{
142		case 0:
143			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
144			break;
145		case 1:
146			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
147			break;
148		case 2:
149			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
150			break;
151		case 3:
152			mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
153			break;
154	}
155}
156
157// Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
158inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
159{
160	// ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
161	// E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
162	// (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b, 
163	// as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
164	// Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
165	mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
166}
167
168////////////////////////////////////
169// ALGEBRAIC
170////////////////////////////////////
171
172// Set this to the element-wise (a + b)
173inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
174{
175	mQ = _mm_add_ps(a.mQ, b.mQ);
176}
177
178// Set this to element-wise (a - b)
179inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
180{
181	mQ = _mm_sub_ps(a.mQ, b.mQ);
182}
183
184// Set this to element-wise multiply (a * b)
185inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
186{
187	mQ = _mm_mul_ps(a.mQ, b.mQ);
188}
189
190// Set this to element-wise quotient (a / b)
191inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
192{
193	mQ = _mm_div_ps( a.mQ, b.mQ );
194}
195
196// Set this to the element-wise absolute value of src
197inline void LLVector4a::setAbs(const LLVector4a& src)
198{
199	static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
200	mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
201}
202
203// Add to each component in this vector the corresponding component in rhs
204inline void LLVector4a::add(const LLVector4a& rhs)
205{
206	mQ = _mm_add_ps(mQ, rhs.mQ);	
207}
208
209// Subtract from each component in this vector the corresponding component in rhs
210inline void LLVector4a::sub(const LLVector4a& rhs)
211{
212	mQ = _mm_sub_ps(mQ, rhs.mQ);
213}
214
215// Multiply each component in this vector by the corresponding component in rhs
216inline void LLVector4a::mul(const LLVector4a& rhs)
217{
218	mQ = _mm_mul_ps(mQ, rhs.mQ);	
219}
220
221// Divide each component in this vector by the corresponding component in rhs
222inline void LLVector4a::div(const LLVector4a& rhs)
223{
224	// TODO: Check accuracy, maybe add divFast
225	mQ = _mm_div_ps(mQ, rhs.mQ);
226}
227
228// Multiply this vector by x in a scalar fashion
229inline void LLVector4a::mul(const F32 x) 
230{
231	LLVector4a t;
232	t.splat(x);
233	
234	mQ = _mm_mul_ps(mQ, t.mQ);
235}
236
237// Set this to (a x b) (geometric cross-product)
238inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
239{
240	// Vectors are stored in memory in w, z, y, x order from high to low
241	// Set vector1 = { a[W], a[X], a[Z], a[Y] }
242	const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
243	// Set vector2 = { b[W], b[Y], b[X], b[Z] }
244	const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
245	// mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
246	mQ = _mm_mul_ps( vector1, vector2 );
247	// vector3 = { a[W], a[Y], a[X], a[Z] }
248	const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
249	// vector4 = { b[W], b[X], b[Z], b[Y] }
250	const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
251	// mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
252	mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
253}
254
/* This function works, but may be slightly slower than the one below on older machines
 inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
 {
 // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
 const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
 // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
 const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
 // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
 const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
 // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
 const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
 // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
 const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
 // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
 mQ = _mm_add_ps(zSplat, xPlusYSplat);
 }*/
271
272// Set all elements to the dot product of the x, y, and z elements in a and b
273inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
274{
275	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
276	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
277	// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
278	const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
279	// xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
280	const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
281	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
282	const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
283	// zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
284	const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
285	// mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
286	mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
287}
288
289// Set all elements to the dot product of the x, y, z, and w elements in a and b
290inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
291{
292	// ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
293	const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
294	// yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
295	const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
296	// zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
297	const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
298	// xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] } 
299	const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
300	const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
301
302	// mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
303	mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
304}
305
306// Return the 3D dot product of this vector and b
307inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
308{
309	const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
310	const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
311	const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
312	const LLQuad xPlusY = _mm_add_ps( ab, splatY );
313	return _mm_add_ps( xPlusY, splatZ );	
314}
315
316// Return the 4D dot product of this vector and b
317inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
318{
319	// ab = { w, z, y, x }
320 	const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
321 	// upperProdsInLowerElems = { y, x, y, x }
322	const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
323	// sumOfPairs = { w+y, z+x, 2y, 2x }
324 	const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
325	// shuffled = { z+x, z+x, z+x, z+x }
326	const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
327	return _mm_add_ss( sumOfPairs, shuffled );
328}
329
330// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
331// Note that this does not consider zero length vectors!
332inline void LLVector4a::normalize3()
333{
334	// lenSqrd = a dot a
335	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
336	// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
337	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
338	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
339	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
340	// Now we do one round of Newton-Raphson approximation to get full accuracy
341	// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
342	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
343	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
344	// = 0.5 * w * (3 - a*w^2)
345	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
346	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
347	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
348	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
349	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
350	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
351	mQ = _mm_mul_ps( mQ, nrApprox );
352}
353
354// Normalize this vector with respect to all components. Accurate to 22 bites of precision.
355// Note that this does not consider zero length vectors!
356inline void LLVector4a::normalize4()
357{
358	// lenSqrd = a dot a
359	LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
360	// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
361	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
362	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
363	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
364	// Now we do one round of Newton-Raphson approximation to get full accuracy
365	// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
366	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
367	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
368	// = 0.5 * w * (3 - a*w^2)
369	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
370	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
371	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
372	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
373	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
374	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
375	mQ = _mm_mul_ps( mQ, nrApprox );
376}
377
378// Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
379// Note that this does not consider zero length vectors!
380inline LLSimdScalar LLVector4a::normalize3withLength()
381{
382	// lenSqrd = a dot a
383	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
384	// rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
385	const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
386	static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
387	static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
388	// Now we do one round of Newton-Raphson approximation to get full accuracy
389	// According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
390	// the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
391	// w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
392	// = 0.5 * w * (3 - a*w^2)
393	// Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
394	// which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
395	const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
396	const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
397	const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
398	const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
399	mQ = _mm_mul_ps( mQ, nrApprox );
400	return _mm_sqrt_ss(lenSqrd);
401}
402
403// Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
404// Note that this does not consider zero length vectors!
405inline void LLVector4a::normalize3fast()
406{
407	LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
408	const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
409	mQ = _mm_mul_ps( mQ, approxRsqrt );
410}
411
412// Return true if this vector is normalized with respect to x,y,z up to tolerance
413inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
414{
415	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
416	LLSimdScalar tol = _mm_load_ss( &tolerance );
417	tol = _mm_mul_ss( tol, tol );
418	LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
419	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
420	lenSquared.setAbs(lenSquared);
421	return _mm_comile_ss( lenSquared, tol );		
422}
423
424// Return true if this vector is normalized with respect to all components up to tolerance
425inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
426{
427	static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
428	LLSimdScalar tol = _mm_load_ss( &tolerance );
429	tol = _mm_mul_ss( tol, tol );
430	LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
431	lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
432	lenSquared.setAbs(lenSquared);
433	return _mm_comile_ss( lenSquared, tol );		
434}
435
436// Set all elements to the length of vector 'v' 
437inline void LLVector4a::setAllLength3( const LLVector4a& v )
438{
439	LLVector4a lenSqrd;
440	lenSqrd.setAllDot3(v, v);
441	
442	mQ = _mm_sqrt_ps(lenSqrd.mQ);
443}
444
445// Get this vector's length
446inline LLSimdScalar LLVector4a::getLength3() const
447{
448	return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
449}
450
451// Set the components of this vector to the minimum of the corresponding components of lhs and rhs
452inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
453{
454	mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
455}
456
457// Set the components of this vector to the maximum of the corresponding components of lhs and rhs
458inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
459{
460	mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
461}
462
463// Set this to  (c * lhs) + rhs * ( 1 - c)
464inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
465{
466	LLVector4a a = lhs;
467	a.mul(c);
468	
469	LLVector4a b = rhs;
470	b.mul(1.f-c);
471	
472	setAdd(a, b);
473}
474
475inline LLBool32 LLVector4a::isFinite3() const
476{
477	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
478	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
479	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
480	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
481	return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
482}
483	
484inline LLBool32 LLVector4a::isFinite4() const
485{
486	static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
487	const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
488	const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
489	const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
490	return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
491}
492
493inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
494{
495	LLRotation inv; inv.setTranspose( rot );
496	setRotated( inv, vec );
497}
498
499inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
500{
501	LLQuaternion2 invRot; invRot.setConjugate( quat );
502	setRotated(invRot, vec);
503}
504
505inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
506{
507	const LLVector4Logical highMask = greaterThan( high );
508	const LLVector4Logical lowMask = lessThan( low );
509
510	setSelectWithMask( highMask, high, *this );
511	setSelectWithMask( lowMask, low, *this );
512}
513
514
515////////////////////////////////////
516// LOGICAL
517////////////////////////////////////	
// The functions in this section will compare the elements in this vector
// to those in rhs and return an LLVector4Logical with all bits set in elements
// where the comparison was true and all bits unset in elements where the comparison
// was false. See llvector4logical.h
522////////////////////////////////////
523// WARNING: Other than equals3 and equals4, these functions do NOT account
524// for floating point tolerance. You should include the appropriate tolerance
525// in the inputs.
526////////////////////////////////////
527
528inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
529{	
530	return _mm_cmpgt_ps(mQ, rhs.mQ);
531}
532
533inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
534{
535	return _mm_cmplt_ps(mQ, rhs.mQ);
536}
537
538inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
539{
540	return _mm_cmpge_ps(mQ, rhs.mQ);
541}
542
543inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
544{
545	return _mm_cmple_ps(mQ, rhs.mQ);
546}
547
548inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
549{
550	return _mm_cmpeq_ps(mQ, rhs.mQ);
551}
552
553// Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
554inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
555{
556	LLVector4a diff; diff.setSub( *this, rhs );
557	diff.setAbs( diff );
558	const LLQuad tol = _mm_set1_ps( tolerance );
559	const LLQuad cmp = _mm_cmplt_ps( diff, tol );
560	return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
561}
562
563inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
564{
565	LLVector4a diff; diff.setSub( *this, rhs );
566	diff.setAbs( diff );
567	const LLQuad tol = _mm_set1_ps( tolerance );
568	const LLQuad t = _mm_cmplt_ps( diff, tol ); 
569	return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
570	
571}
572
573////////////////////////////////////
574// OPERATORS
575////////////////////////////////////	
576
// Do NOT add additional operators without consulting someone with SSE experience
578inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
579{
580	mQ = rhs.mQ;
581	return *this;
582}
583
584inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
585{
586	mQ = rhs;
587	return *this;
588}
589
590inline LLVector4a::operator LLQuad() const
591{
592	return mQ;
593}