PageRenderTime 38ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/indra/llmath/llvector4a.cpp

https://bitbucket.org/lindenlab/viewer-beta/
C++ | 222 lines | 139 code | 36 blank | 47 comment | 6 complexity | 0fc17e1e87808c3ae4d9ef389995d856 MD5 | raw file
Possible License(s): LGPL-2.1
  1. /**
  2. * @file llvector4a.cpp
  3. * @brief SIMD vector implementation
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6. * Second Life Viewer Source Code
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * This library is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation;
  12. * version 2.1 of the License only.
  13. *
  14. * This library is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with this library; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. *
  23. * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
  24. * $/LicenseInfo$
  25. */
  26. #include "llmath.h"
  27. #include "llquantize.h"
  28. extern const LLQuad F_ZERO_4A = { 0, 0, 0, 0 };
  29. extern const LLQuad F_APPROXIMATELY_ZERO_4A = {
  30. F_APPROXIMATELY_ZERO,
  31. F_APPROXIMATELY_ZERO,
  32. F_APPROXIMATELY_ZERO,
  33. F_APPROXIMATELY_ZERO
  34. };
  35. extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A );
  36. extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A );
  37. /*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
  38. {
  39. assert(src != NULL);
  40. assert(dst != NULL);
  41. assert(bytes > 0);
  42. assert((bytes % sizeof(F32))== 0);
  43. F32* end = dst + (bytes / sizeof(F32) );
  44. if (bytes > 64)
  45. {
  46. F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
  47. //at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies
  48. F32* end_64 = end-16;
  49. _mm_prefetch((char*)begin_64, _MM_HINT_NTA);
  50. _mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
  51. _mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
  52. _mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
  53. while (dst < begin_64)
  54. {
  55. copy4a(dst, src);
  56. dst += 4;
  57. src += 4;
  58. }
  59. while (dst < end_64)
  60. {
  61. _mm_prefetch((char*)src + 512, _MM_HINT_NTA);
  62. _mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
  63. copy4a(dst, src);
  64. copy4a(dst+4, src+4);
  65. copy4a(dst+8, src+8);
  66. copy4a(dst+12, src+12);
  67. dst += 16;
  68. src += 16;
  69. }
  70. }
  71. while (dst < end)
  72. {
  73. copy4a(dst, src);
  74. dst += 4;
  75. src += 4;
  76. }
  77. }
  78. void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )
  79. {
  80. const LLVector4a col0 = rot.getColumn(0);
  81. const LLVector4a col1 = rot.getColumn(1);
  82. const LLVector4a col2 = rot.getColumn(2);
  83. LLVector4a result = _mm_load_ss( vec.getF32ptr() );
  84. result.splat<0>( result );
  85. result.mul( col0 );
  86. {
  87. LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() + 1 );
  88. yyyy.splat<0>( yyyy );
  89. yyyy.mul( col1 );
  90. result.add( yyyy );
  91. }
  92. {
  93. LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() + 2 );
  94. zzzz.splat<0>( zzzz );
  95. zzzz.mul( col2 );
  96. result.add( zzzz );
  97. }
  98. *this = result;
  99. }
  100. void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec )
  101. {
  102. const LLVector4a& quatVec = quat.getVector4a();
  103. LLVector4a temp; temp.setCross3(quatVec, vec);
  104. temp.add( temp );
  105. const LLVector4a realPart( quatVec.getScalarAt<3>() );
  106. LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart );
  107. mQ = vec;
  108. add( tempTimesReal );
  109. LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp );
  110. add(imagCrossTemp);
  111. }
  112. void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high )
  113. {
  114. LLVector4a val(mQ);
  115. LLVector4a delta; delta.setSub( high, low );
  116. {
  117. val.clamp(low, high);
  118. val.sub(low);
  119. // 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
  120. const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
  121. // {
  122. // static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
  123. // LLVector4a two; two.load4a( F_TWO_4A );
  124. //
  125. // // Here we use _mm_rcp_ps plus one round of newton-raphson
  126. // // We wish to find 'x' such that x = 1/delta
  127. // // As a first approximation, we take x0 = _mm_rcp_ps(delta)
  128. // // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
  129. // // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
  130. // const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
  131. // oneOverDelta.setMul( delta, recipApprox );
  132. // oneOverDelta.setSub( two, oneOverDelta );
  133. // oneOverDelta.mul( recipApprox );
  134. // }
  135. val.mul(oneOverDelta);
  136. val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
  137. }
  138. val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
  139. {
  140. val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
  141. val.mul(delta);
  142. val.add(low);
  143. }
  144. {
  145. LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
  146. LLVector4a absVal; absVal.setAbs( val );
  147. setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
  148. }
  149. }
  150. void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
  151. {
  152. LLVector4a val(mQ);
  153. LLVector4a delta; delta.setSub( high, low );
  154. {
  155. val.clamp(low, high);
  156. val.sub(low);
  157. // 16-bit quantization means we need a round of Newton-Raphson
  158. LLVector4a oneOverDelta;
  159. {
  160. static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
  161. LLVector4a two; two.load4a( F_TWO_4A );
  162. // Here we use _mm_rcp_ps plus one round of newton-raphson
  163. // We wish to find 'x' such that x = 1/delta
  164. // As a first approximation, we take x0 = _mm_rcp_ps(delta)
  165. // Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
  166. // See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
  167. const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
  168. oneOverDelta.setMul( delta, recipApprox );
  169. oneOverDelta.setSub( two, oneOverDelta );
  170. oneOverDelta.mul( recipApprox );
  171. }
  172. val.mul(oneOverDelta);
  173. val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
  174. }
  175. val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
  176. {
  177. val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
  178. val.mul(delta);
  179. val.add(low);
  180. }
  181. {
  182. LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
  183. LLVector4a absVal; absVal.setAbs( val );
  184. setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
  185. }
  186. }