PageRenderTime 185ms CodeModel.GetById 25ms RepoModel.GetById 2ms app.codeStats 1ms

/indra/llmath/llvector4a.inl

https://bitbucket.org/lindenlab/viewer-beta/
C++ Header | 593 lines | 351 code | 72 blank | 170 comment | 4 complexity | 35ac1670d8371ecb03f92369f2ddb24e MD5 | raw file
Possible License(s): LGPL-2.1
  1. /**
  2. * @file llvector4a.inl
  3. * @brief LLVector4a inline function implementations
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6. * Second Life Viewer Source Code
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * This library is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation;
  12. * version 2.1 of the License only.
  13. *
  14. * This library is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with this library; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. *
  23. * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
  24. * $/LicenseInfo$
  25. */
  26. ////////////////////////////////////
  27. // LOAD/STORE
  28. ////////////////////////////////////
  29. // Load from 16-byte aligned src array (preferred method of loading)
  30. inline void LLVector4a::load4a(const F32* src)
  31. {
  32. mQ = _mm_load_ps(src);
  33. }
  34. // Load from unaligned src array (NB: Significantly slower than load4a)
  35. inline void LLVector4a::loadua(const F32* src)
  36. {
  37. mQ = _mm_loadu_ps(src);
  38. }
  39. // Load only three floats beginning at address 'src'. Slowest method.
  40. inline void LLVector4a::load3(const F32* src)
  41. {
  42. // mQ = { 0.f, src[2], src[1], src[0] } = { W, Z, Y, X }
  43. // NB: This differs from the convention of { Z, Y, X, W }
  44. mQ = _mm_set_ps(0.f, src[2], src[1], src[0]);
  45. }
  46. // Store to a 16-byte aligned memory address
  47. inline void LLVector4a::store4a(F32* dst) const
  48. {
  49. _mm_store_ps(dst, mQ);
  50. }
  51. ////////////////////////////////////
  52. // BASIC GET/SET
  53. ////////////////////////////////////
  54. // Return a "this" as an F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
  55. F32* LLVector4a::getF32ptr()
  56. {
  57. return (F32*) &mQ;
  58. }
  59. // Return a "this" as a const F32 pointer. Do not use unless you have a very good reason. (Not sure? Ask Falcon)
  60. const F32* const LLVector4a::getF32ptr() const
  61. {
  62. return (const F32* const) &mQ;
  63. }
  64. // Read-only access a single float in this vector. Do not use in proximity to any function call that manipulates
  65. // the data at the whole vector level or you will incur a substantial penalty. Consider using the splat functions instead
  66. inline F32 LLVector4a::operator[](const S32 idx) const
  67. {
  68. return ((F32*)&mQ)[idx];
  69. }
  70. // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
  71. inline LLSimdScalar LLVector4a::getScalarAt(const S32 idx) const
  72. {
  73. // Return appropriate LLQuad. It will be cast to LLSimdScalar automatically (should be effectively a nop)
  74. switch (idx)
  75. {
  76. case 0:
  77. return mQ;
  78. case 1:
  79. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(1, 1, 1, 1));
  80. case 2:
  81. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(2, 2, 2, 2));
  82. case 3:
  83. default:
  84. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(3, 3, 3, 3));
  85. }
  86. }
  87. // Prefer this method for read-only access to a single element. Prefer the templated version if the elem is known at compile time.
  88. template <int N> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt() const
  89. {
  90. return _mm_shuffle_ps(mQ, mQ, _MM_SHUFFLE(N, N, N, N));
  91. }
  92. template<> LL_FORCE_INLINE LLSimdScalar LLVector4a::getScalarAt<0>() const
  93. {
  94. return mQ;
  95. }
  96. // Set to an x, y, z and optional w provided
  97. inline void LLVector4a::set(F32 x, F32 y, F32 z, F32 w)
  98. {
  99. mQ = _mm_set_ps(w, z, y, x);
  100. }
  101. // Set to all zeros
  102. inline void LLVector4a::clear()
  103. {
  104. mQ = LLVector4a::getZero().mQ;
  105. }
  106. inline void LLVector4a::splat(const F32 x)
  107. {
  108. mQ = _mm_set1_ps(x);
  109. }
  110. inline void LLVector4a::splat(const LLSimdScalar& x)
  111. {
  112. mQ = _mm_shuffle_ps( x.getQuad(), x.getQuad(), _MM_SHUFFLE(0,0,0,0) );
  113. }
  114. // Set all 4 elements to element N of src, with N known at compile time
  115. template <int N> void LLVector4a::splat(const LLVector4a& src)
  116. {
  117. mQ = _mm_shuffle_ps(src.mQ, src.mQ, _MM_SHUFFLE(N, N, N, N) );
  118. }
  119. // Set all 4 elements to element i of v, with i NOT known at compile time
  120. inline void LLVector4a::splat(const LLVector4a& v, U32 i)
  121. {
  122. switch (i)
  123. {
  124. case 0:
  125. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(0, 0, 0, 0));
  126. break;
  127. case 1:
  128. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(1, 1, 1, 1));
  129. break;
  130. case 2:
  131. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(2, 2, 2, 2));
  132. break;
  133. case 3:
  134. mQ = _mm_shuffle_ps(v.mQ, v.mQ, _MM_SHUFFLE(3, 3, 3, 3));
  135. break;
  136. }
  137. }
  138. // Select bits from sourceIfTrue and sourceIfFalse according to bits in mask
  139. inline void LLVector4a::setSelectWithMask( const LLVector4Logical& mask, const LLVector4a& sourceIfTrue, const LLVector4a& sourceIfFalse )
  140. {
  141. // ((( sourceIfTrue ^ sourceIfFalse ) & mask) ^ sourceIfFalse )
  142. // E.g., sourceIfFalse = 1010b, sourceIfTrue = 0101b, mask = 1100b
  143. // (sourceIfTrue ^ sourceIfFalse) = 1111b --> & mask = 1100b --> ^ sourceIfFalse = 0110b,
  144. // as expected (01 from sourceIfTrue, 10 from sourceIfFalse)
  145. // Courtesy of Mark++, http://markplusplus.wordpress.com/2007/03/14/fast-sse-select-operation/
  146. mQ = _mm_xor_ps( sourceIfFalse, _mm_and_ps( mask, _mm_xor_ps( sourceIfTrue, sourceIfFalse ) ) );
  147. }
  148. ////////////////////////////////////
  149. // ALGEBRAIC
  150. ////////////////////////////////////
  151. // Set this to the element-wise (a + b)
  152. inline void LLVector4a::setAdd(const LLVector4a& a, const LLVector4a& b)
  153. {
  154. mQ = _mm_add_ps(a.mQ, b.mQ);
  155. }
  156. // Set this to element-wise (a - b)
  157. inline void LLVector4a::setSub(const LLVector4a& a, const LLVector4a& b)
  158. {
  159. mQ = _mm_sub_ps(a.mQ, b.mQ);
  160. }
  161. // Set this to element-wise multiply (a * b)
  162. inline void LLVector4a::setMul(const LLVector4a& a, const LLVector4a& b)
  163. {
  164. mQ = _mm_mul_ps(a.mQ, b.mQ);
  165. }
  166. // Set this to element-wise quotient (a / b)
  167. inline void LLVector4a::setDiv(const LLVector4a& a, const LLVector4a& b)
  168. {
  169. mQ = _mm_div_ps( a.mQ, b.mQ );
  170. }
  171. // Set this to the element-wise absolute value of src
  172. inline void LLVector4a::setAbs(const LLVector4a& src)
  173. {
  174. static const LL_ALIGN_16(U32 F_ABS_MASK_4A[4]) = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
  175. mQ = _mm_and_ps(src.mQ, *reinterpret_cast<const LLQuad*>(F_ABS_MASK_4A));
  176. }
  177. // Add to each component in this vector the corresponding component in rhs
  178. inline void LLVector4a::add(const LLVector4a& rhs)
  179. {
  180. mQ = _mm_add_ps(mQ, rhs.mQ);
  181. }
  182. // Subtract from each component in this vector the corresponding component in rhs
  183. inline void LLVector4a::sub(const LLVector4a& rhs)
  184. {
  185. mQ = _mm_sub_ps(mQ, rhs.mQ);
  186. }
  187. // Multiply each component in this vector by the corresponding component in rhs
  188. inline void LLVector4a::mul(const LLVector4a& rhs)
  189. {
  190. mQ = _mm_mul_ps(mQ, rhs.mQ);
  191. }
  192. // Divide each component in this vector by the corresponding component in rhs
  193. inline void LLVector4a::div(const LLVector4a& rhs)
  194. {
  195. // TODO: Check accuracy, maybe add divFast
  196. mQ = _mm_div_ps(mQ, rhs.mQ);
  197. }
  198. // Multiply this vector by x in a scalar fashion
  199. inline void LLVector4a::mul(const F32 x)
  200. {
  201. LLVector4a t;
  202. t.splat(x);
  203. mQ = _mm_mul_ps(mQ, t.mQ);
  204. }
  205. // Set this to (a x b) (geometric cross-product)
  206. inline void LLVector4a::setCross3(const LLVector4a& a, const LLVector4a& b)
  207. {
  208. // Vectors are stored in memory in w, z, y, x order from high to low
  209. // Set vector1 = { a[W], a[X], a[Z], a[Y] }
  210. const LLQuad vector1 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
  211. // Set vector2 = { b[W], b[Y], b[X], b[Z] }
  212. const LLQuad vector2 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
  213. // mQ = { a[W]*b[W], a[X]*b[Y], a[Z]*b[X], a[Y]*b[Z] }
  214. mQ = _mm_mul_ps( vector1, vector2 );
  215. // vector3 = { a[W], a[Y], a[X], a[Z] }
  216. const LLQuad vector3 = _mm_shuffle_ps( a.mQ, a.mQ, _MM_SHUFFLE( 3, 1, 0, 2 ));
  217. // vector4 = { b[W], b[X], b[Z], b[Y] }
  218. const LLQuad vector4 = _mm_shuffle_ps( b.mQ, b.mQ, _MM_SHUFFLE( 3, 0, 2, 1 ));
  219. // mQ = { 0, a[X]*b[Y] - a[Y]*b[X], a[Z]*b[X] - a[X]*b[Z], a[Y]*b[Z] - a[Z]*b[Y] }
  220. mQ = _mm_sub_ps( mQ, _mm_mul_ps( vector3, vector4 ));
  221. }
  222. /* This function works, but may be slightly slower than the one below on older machines
  223. inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
  224. {
  225. // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
  226. const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
  227. // wzxy = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
  228. const LLQuad wzxy = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE(3, 2, 0, 1 ));
  229. // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  230. const LLQuad xPlusY = _mm_add_ps(ab, wzxy);
  231. // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  232. const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
  233. // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
  234. const LLQuad zSplat = _mm_shuffle_ps( ab, ab, _MM_SHUFFLE( 2, 2, 2, 2 ));
  235. // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
  236. mQ = _mm_add_ps(zSplat, xPlusYSplat);
  237. }*/
  238. // Set all elements to the dot product of the x, y, and z elements in a and b
  239. inline void LLVector4a::setAllDot3(const LLVector4a& a, const LLVector4a& b)
  240. {
  241. // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
  242. const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
  243. // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
  244. const __m128i wzxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(3, 2, 0, 1 ));
  245. // xPlusY = { 2*a[W]*b[W], 2 * a[Z] * b[Z], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  246. const LLQuad xPlusY = _mm_add_ps(ab, _mm_castsi128_ps(wzxy));
  247. // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  248. const LLQuad xPlusYSplat = _mm_movelh_ps(xPlusY, xPlusY);
  249. // zSplat = { a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z], a[Z]*b[Z] }
  250. const __m128i zSplat = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE( 2, 2, 2, 2 ));
  251. // mQ = { a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
  252. mQ = _mm_add_ps(_mm_castsi128_ps(zSplat), xPlusYSplat);
  253. }
  254. // Set all elements to the dot product of the x, y, z, and w elements in a and b
  255. inline void LLVector4a::setAllDot4(const LLVector4a& a, const LLVector4a& b)
  256. {
  257. // ab = { a[W]*b[W], a[Z]*b[Z], a[Y]*b[Y], a[X]*b[X] }
  258. const LLQuad ab = _mm_mul_ps( a.mQ, b.mQ );
  259. // yzxw = { a[W]*b[W], a[Z]*b[Z], a[X]*b[X], a[Y]*b[Y] }
  260. const __m128i zwxy = _mm_shuffle_epi32(_mm_castps_si128(ab), _MM_SHUFFLE(2, 3, 0, 1 ));
  261. // zPlusWandXplusY = { a[W]*b[W] + a[Z]*b[Z], a[Z] * b[Z] + a[W]*b[W], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  262. const LLQuad zPlusWandXplusY = _mm_add_ps(ab, _mm_castsi128_ps(zwxy));
  263. // xPlusYSplat = { a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y], a[Y]*b[Y] + a[X] * b[X], a[X] * b[X] + a[Y] * b[Y] }
  264. const LLQuad xPlusYSplat = _mm_movelh_ps(zPlusWandXplusY, zPlusWandXplusY);
  265. const LLQuad zPlusWSplat = _mm_movehl_ps(zPlusWandXplusY, zPlusWandXplusY);
  266. // mQ = { a[W]*b[W] + a[Z] * b[Z] + a[Y] * b[Y] + a[X] * b[X], same, same, same }
  267. mQ = _mm_add_ps(xPlusYSplat, zPlusWSplat);
  268. }
  269. // Return the 3D dot product of this vector and b
  270. inline LLSimdScalar LLVector4a::dot3(const LLVector4a& b) const
  271. {
  272. const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
  273. const LLQuad splatY = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(1, 1, 1, 1) ) );
  274. const LLQuad splatZ = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128(ab), _MM_SHUFFLE(2, 2, 2, 2) ) );
  275. const LLQuad xPlusY = _mm_add_ps( ab, splatY );
  276. return _mm_add_ps( xPlusY, splatZ );
  277. }
  278. // Return the 4D dot product of this vector and b
  279. inline LLSimdScalar LLVector4a::dot4(const LLVector4a& b) const
  280. {
  281. // ab = { w, z, y, x }
  282. const LLQuad ab = _mm_mul_ps( mQ, b.mQ );
  283. // upperProdsInLowerElems = { y, x, y, x }
  284. const LLQuad upperProdsInLowerElems = _mm_movehl_ps( ab, ab );
  285. // sumOfPairs = { w+y, z+x, 2y, 2x }
  286. const LLQuad sumOfPairs = _mm_add_ps( upperProdsInLowerElems, ab );
  287. // shuffled = { z+x, z+x, z+x, z+x }
  288. const LLQuad shuffled = _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( sumOfPairs ), _MM_SHUFFLE(1, 1, 1, 1) ) );
  289. return _mm_add_ss( sumOfPairs, shuffled );
  290. }
  291. // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
  292. // Note that this does not consider zero length vectors!
  293. inline void LLVector4a::normalize3()
  294. {
  295. // lenSqrd = a dot a
  296. LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
  297. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
  298. const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
  299. static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  300. static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
  301. // Now we do one round of Newton-Raphson approximation to get full accuracy
  302. // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
  303. // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  304. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
  305. // = 0.5 * w * (3 - a*w^2)
  306. // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
  307. // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
  308. const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
  309. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
  310. const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
  311. const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  312. mQ = _mm_mul_ps( mQ, nrApprox );
  313. }
  314. // Normalize this vector with respect to all components. Accurate to 22 bites of precision.
  315. // Note that this does not consider zero length vectors!
  316. inline void LLVector4a::normalize4()
  317. {
  318. // lenSqrd = a dot a
  319. LLVector4a lenSqrd; lenSqrd.setAllDot4( *this, *this );
  320. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
  321. const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
  322. static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  323. static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
  324. // Now we do one round of Newton-Raphson approximation to get full accuracy
  325. // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
  326. // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  327. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
  328. // = 0.5 * w * (3 - a*w^2)
  329. // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
  330. // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
  331. const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
  332. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
  333. const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
  334. const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  335. mQ = _mm_mul_ps( mQ, nrApprox );
  336. }
  337. // Normalize this vector with respect to the x, y, and z components only. Accurate to 22 bites of precision. W component is destroyed
  338. // Note that this does not consider zero length vectors!
  339. inline LLSimdScalar LLVector4a::normalize3withLength()
  340. {
  341. // lenSqrd = a dot a
  342. LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
  343. // rsqrt = approximate reciprocal square (i.e., { ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2, ~1/len(a)^2 }
  344. const LLQuad rsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
  345. static const LLQuad half = { 0.5f, 0.5f, 0.5f, 0.5f };
  346. static const LLQuad three = {3.f, 3.f, 3.f, 3.f };
  347. // Now we do one round of Newton-Raphson approximation to get full accuracy
  348. // According to the Newton-Raphson method, given a first 'w' for the root of f(x) = 1/x^2 - a (i.e., x = 1/sqrt(a))
  349. // the next better approximation w[i+1] = w - f(w)/f'(w) = w - (1/w^2 - a)/(-2*w^(-3))
  350. // w[i+1] = w + 0.5 * (1/w^2 - a) * w^3 = w + 0.5 * (w - a*w^3) = 1.5 * w - 0.5 * a * w^3
  351. // = 0.5 * w * (3 - a*w^2)
  352. // Our first approx is w = rsqrt. We need out = a * w[i+1] (this is the input vector 'a', not the 'a' from the above formula
  353. // which is actually lenSqrd). So out = a * [0.5*rsqrt * (3 - lenSqrd*rsqrt*rsqrt)]
  354. const LLQuad AtimesRsqrt = _mm_mul_ps( lenSqrd.mQ, rsqrt );
  355. const LLQuad AtimesRsqrtTimesRsqrt = _mm_mul_ps( AtimesRsqrt, rsqrt );
  356. const LLQuad threeMinusAtimesRsqrtTimesRsqrt = _mm_sub_ps(three, AtimesRsqrtTimesRsqrt );
  357. const LLQuad nrApprox = _mm_mul_ps(half, _mm_mul_ps(rsqrt, threeMinusAtimesRsqrtTimesRsqrt));
  358. mQ = _mm_mul_ps( mQ, nrApprox );
  359. return _mm_sqrt_ss(lenSqrd);
  360. }
  361. // Normalize this vector with respect to the x, y, and z components only. Accurate only to 10-12 bits of precision. W component is destroyed
  362. // Note that this does not consider zero length vectors!
  363. inline void LLVector4a::normalize3fast()
  364. {
  365. LLVector4a lenSqrd; lenSqrd.setAllDot3( *this, *this );
  366. const LLQuad approxRsqrt = _mm_rsqrt_ps(lenSqrd.mQ);
  367. mQ = _mm_mul_ps( mQ, approxRsqrt );
  368. }
  369. // Return true if this vector is normalized with respect to x,y,z up to tolerance
  370. inline LLBool32 LLVector4a::isNormalized3( F32 tolerance ) const
  371. {
  372. static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
  373. LLSimdScalar tol = _mm_load_ss( &tolerance );
  374. tol = _mm_mul_ss( tol, tol );
  375. LLVector4a lenSquared; lenSquared.setAllDot3( *this, *this );
  376. lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
  377. lenSquared.setAbs(lenSquared);
  378. return _mm_comile_ss( lenSquared, tol );
  379. }
  380. // Return true if this vector is normalized with respect to all components up to tolerance
  381. inline LLBool32 LLVector4a::isNormalized4( F32 tolerance ) const
  382. {
  383. static LL_ALIGN_16(const U32 ones[4]) = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
  384. LLSimdScalar tol = _mm_load_ss( &tolerance );
  385. tol = _mm_mul_ss( tol, tol );
  386. LLVector4a lenSquared; lenSquared.setAllDot4( *this, *this );
  387. lenSquared.sub( *reinterpret_cast<const LLVector4a*>(ones) );
  388. lenSquared.setAbs(lenSquared);
  389. return _mm_comile_ss( lenSquared, tol );
  390. }
  391. // Set all elements to the length of vector 'v'
  392. inline void LLVector4a::setAllLength3( const LLVector4a& v )
  393. {
  394. LLVector4a lenSqrd;
  395. lenSqrd.setAllDot3(v, v);
  396. mQ = _mm_sqrt_ps(lenSqrd.mQ);
  397. }
  398. // Get this vector's length
  399. inline LLSimdScalar LLVector4a::getLength3() const
  400. {
  401. return _mm_sqrt_ss( dot3( (const LLVector4a)mQ ) );
  402. }
  403. // Set the components of this vector to the minimum of the corresponding components of lhs and rhs
  404. inline void LLVector4a::setMin(const LLVector4a& lhs, const LLVector4a& rhs)
  405. {
  406. mQ = _mm_min_ps(lhs.mQ, rhs.mQ);
  407. }
  408. // Set the components of this vector to the maximum of the corresponding components of lhs and rhs
  409. inline void LLVector4a::setMax(const LLVector4a& lhs, const LLVector4a& rhs)
  410. {
  411. mQ = _mm_max_ps(lhs.mQ, rhs.mQ);
  412. }
  413. // Set this to (c * lhs) + rhs * ( 1 - c)
  414. inline void LLVector4a::setLerp(const LLVector4a& lhs, const LLVector4a& rhs, F32 c)
  415. {
  416. LLVector4a a = lhs;
  417. a.mul(c);
  418. LLVector4a b = rhs;
  419. b.mul(1.f-c);
  420. setAdd(a, b);
  421. }
  422. inline LLBool32 LLVector4a::isFinite3() const
  423. {
  424. static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  425. const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
  426. const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
  427. const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
  428. return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZ );
  429. }
  430. inline LLBool32 LLVector4a::isFinite4() const
  431. {
  432. static LL_ALIGN_16(const U32 nanOrInfMask[4]) = { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  433. const __m128i nanOrInfMaskV = *reinterpret_cast<const __m128i*> (nanOrInfMask);
  434. const __m128i maskResult = _mm_and_si128( _mm_castps_si128(mQ), nanOrInfMaskV );
  435. const LLVector4Logical equalityCheck = _mm_castsi128_ps(_mm_cmpeq_epi32( maskResult, nanOrInfMaskV ));
  436. return !equalityCheck.areAnySet( LLVector4Logical::MASK_XYZW );
  437. }
  438. inline void LLVector4a::setRotatedInv( const LLRotation& rot, const LLVector4a& vec )
  439. {
  440. LLRotation inv; inv.setTranspose( rot );
  441. setRotated( inv, vec );
  442. }
  443. inline void LLVector4a::setRotatedInv( const LLQuaternion2& quat, const LLVector4a& vec )
  444. {
  445. LLQuaternion2 invRot; invRot.setConjugate( quat );
  446. setRotated(invRot, vec);
  447. }
  448. inline void LLVector4a::clamp( const LLVector4a& low, const LLVector4a& high )
  449. {
  450. const LLVector4Logical highMask = greaterThan( high );
  451. const LLVector4Logical lowMask = lessThan( low );
  452. setSelectWithMask( highMask, high, *this );
  453. setSelectWithMask( lowMask, low, *this );
  454. }
  455. ////////////////////////////////////
  456. // LOGICAL
  457. ////////////////////////////////////
  458. // The functions in this section will compare the elements in this vector
  459. // to those in rhs and return an LLVector4Logical with all bits set in elements
  460. // where the comparison was true and all bits unset in elements where the comparison
  461. // was false. See llvector4logical.h
  462. ////////////////////////////////////
  463. // WARNING: Other than equals3 and equals4, these functions do NOT account
  464. // for floating point tolerance. You should include the appropriate tolerance
  465. // in the inputs.
  466. ////////////////////////////////////
  467. inline LLVector4Logical LLVector4a::greaterThan(const LLVector4a& rhs) const
  468. {
  469. return _mm_cmpgt_ps(mQ, rhs.mQ);
  470. }
  471. inline LLVector4Logical LLVector4a::lessThan(const LLVector4a& rhs) const
  472. {
  473. return _mm_cmplt_ps(mQ, rhs.mQ);
  474. }
  475. inline LLVector4Logical LLVector4a::greaterEqual(const LLVector4a& rhs) const
  476. {
  477. return _mm_cmpge_ps(mQ, rhs.mQ);
  478. }
  479. inline LLVector4Logical LLVector4a::lessEqual(const LLVector4a& rhs) const
  480. {
  481. return _mm_cmple_ps(mQ, rhs.mQ);
  482. }
  483. inline LLVector4Logical LLVector4a::equal(const LLVector4a& rhs) const
  484. {
  485. return _mm_cmpeq_ps(mQ, rhs.mQ);
  486. }
  487. // Returns true if this and rhs are componentwise equal up to the specified absolute tolerance
  488. inline bool LLVector4a::equals4(const LLVector4a& rhs, F32 tolerance ) const
  489. {
  490. LLVector4a diff; diff.setSub( *this, rhs );
  491. diff.setAbs( diff );
  492. const LLQuad tol = _mm_set1_ps( tolerance );
  493. const LLQuad cmp = _mm_cmplt_ps( diff, tol );
  494. return (_mm_movemask_ps( cmp ) & LLVector4Logical::MASK_XYZW) == LLVector4Logical::MASK_XYZW;
  495. }
  496. inline bool LLVector4a::equals3(const LLVector4a& rhs, F32 tolerance ) const
  497. {
  498. LLVector4a diff; diff.setSub( *this, rhs );
  499. diff.setAbs( diff );
  500. const LLQuad tol = _mm_set1_ps( tolerance );
  501. const LLQuad t = _mm_cmplt_ps( diff, tol );
  502. return (_mm_movemask_ps( t ) & LLVector4Logical::MASK_XYZ) == LLVector4Logical::MASK_XYZ;
  503. }
  504. ////////////////////////////////////
  505. // OPERATORS
  506. ////////////////////////////////////
  507. // Do NOT add additional operators without consulting someone with SSE experience
  508. inline const LLVector4a& LLVector4a::operator= ( const LLVector4a& rhs )
  509. {
  510. mQ = rhs.mQ;
  511. return *this;
  512. }
  513. inline const LLVector4a& LLVector4a::operator= ( const LLQuad& rhs )
  514. {
  515. mQ = rhs;
  516. return *this;
  517. }
  518. inline LLVector4a::operator LLQuad() const
  519. {
  520. return mQ;
  521. }