PageRenderTime 27ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/indra/llmath/llmatrix3a.cpp

https://bitbucket.org/lindenlab/viewer-beta/
C++ | 134 lines | 91 code | 18 blank | 25 comment | 3 complexity | da4b3f5c4b375d3110acff065537b603 MD5 | raw file
Possible License(s): LGPL-2.1
  1. /**
  2. * @file llmatrix3a.cpp
  3. * @brief SIMD matrix (LLMatrix3a) implementation
  4. *
  5. * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6. * Second Life Viewer Source Code
  7. * Copyright (C) 2010, Linden Research, Inc.
  8. *
  9. * This library is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation;
  12. * version 2.1 of the License only.
  13. *
  14. * This library is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with this library; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. *
  23. * Linden Research, Inc., 945 Battery Street, San Francisco, CA 94111 USA
  24. * $/LicenseInfo$
  25. */
  26. #include "llmath.h"
  27. static LL_ALIGN_16(const F32 M_IDENT_3A[12]) =
  28. { 1.f, 0.f, 0.f, 0.f, // Column 1
  29. 0.f, 1.f, 0.f, 0.f, // Column 2
  30. 0.f, 0.f, 1.f, 0.f }; // Column 3
  31. extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A);
  32. void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs )
  33. {
  34. const LLVector4a col0 = lhs.getColumn(0);
  35. const LLVector4a col1 = lhs.getColumn(1);
  36. const LLVector4a col2 = lhs.getColumn(2);
  37. for ( int i = 0; i < 3; i++ )
  38. {
  39. LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() );
  40. xxxx.splat<0>( xxxx );
  41. xxxx.mul( col0 );
  42. {
  43. LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 1 );
  44. yyyy.splat<0>( yyyy );
  45. yyyy.mul( col1 );
  46. xxxx.add( yyyy );
  47. }
  48. {
  49. LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() + 2 );
  50. zzzz.splat<0>( zzzz );
  51. zzzz.mul( col2 );
  52. xxxx.add( zzzz );
  53. }
  54. xxxx.store4a( mColumns[i].getF32ptr() );
  55. }
  56. }
  57. /*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst )
  58. {
  59. const LLVector4a col0 = xform.getColumn(0);
  60. const LLVector4a col1 = xform.getColumn(1);
  61. const LLVector4a col2 = xform.getColumn(2);
  62. const LLVector4a* maxAddr = src + numVectors;
  63. if ( numVectors & 0x1 )
  64. {
  65. LLVector4a xxxx = _mm_load_ss( (const F32*)src );
  66. LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
  67. LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 );
  68. xxxx.splat<0>( xxxx );
  69. yyyy.splat<0>( yyyy );
  70. zzzz.splat<0>( zzzz );
  71. xxxx.mul( col0 );
  72. yyyy.mul( col1 );
  73. zzzz.mul( col2 );
  74. xxxx.add( yyyy );
  75. xxxx.add( zzzz );
  76. xxxx.store4a( (F32*)dst );
  77. src++;
  78. dst++;
  79. }
  80. numVectors >>= 1;
  81. while ( src < maxAddr )
  82. {
  83. _mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA );
  84. _mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA );
  85. LLVector4a xxxx = _mm_load_ss( (const F32*)src );
  86. LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) );
  87. xxxx.splat<0>( xxxx );
  88. xxxx1.splat<0>( xxxx1 );
  89. xxxx.mul( col0 );
  90. xxxx1.mul( col0 );
  91. {
  92. LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
  93. LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1);
  94. yyyy.splat<0>( yyyy );
  95. yyyy1.splat<0>( yyyy1 );
  96. yyyy.mul( col1 );
  97. yyyy1.mul( col1 );
  98. xxxx.add( yyyy );
  99. xxxx1.add( yyyy1 );
  100. }
  101. {
  102. LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 );
  103. LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 );
  104. zzzz.splat<0>( zzzz );
  105. zzzz1.splat<0>( zzzz1 );
  106. zzzz.mul( col2 );
  107. zzzz1.mul( col2 );
  108. xxxx.add( zzzz );
  109. xxxx1.add( zzzz1 );
  110. }
  111. xxxx.store4a(dst->getF32ptr());
  112. src++;
  113. dst++;
  114. xxxx1.store4a((F32*)dst++);
  115. }
  116. }