PageRenderTime 67ms CodeModel.GetById 26ms app.highlight 37ms RepoModel.GetById 1ms app.codeStats 1ms

/indra/llmath/llmatrix3a.cpp

https://bitbucket.org/lindenlab/viewer-beta/
C++ | 134 lines | 91 code | 18 blank | 25 comment | 3 complexity | da4b3f5c4b375d3110acff065537b603 MD5 | raw file
  1/** 
  2 * @file llmatrix3a.cpp
  3 * @brief SIMD 3x3 matrix implementation
  4 *
  5 * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6 * Second Life Viewer Source Code
  7 * Copyright (C) 2010, Linden Research, Inc.
  8 * 
  9 * This library is free software; you can redistribute it and/or
 10 * modify it under the terms of the GNU Lesser General Public
 11 * License as published by the Free Software Foundation;
 12 * version 2.1 of the License only.
 13 * 
 14 * This library is distributed in the hope that it will be useful,
 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 17 * Lesser General Public License for more details.
 18 * 
 19 * You should have received a copy of the GNU Lesser General Public
 20 * License along with this library; if not, write to the Free Software
 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 22 * 
 23 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 24 * $/LicenseInfo$
 25 */
 26
 27#include "llmath.h"
 28
 29static LL_ALIGN_16(const F32 M_IDENT_3A[12]) = 
 30												{	1.f, 0.f, 0.f, 0.f, // Column 1
 31													0.f, 1.f, 0.f, 0.f, // Column 2
 32													0.f, 0.f, 1.f, 0.f }; // Column 3
 33
 34extern const LLMatrix3a LL_M3A_IDENTITY = *reinterpret_cast<const LLMatrix3a*> (M_IDENT_3A);
 35
 36void LLMatrix3a::setMul( const LLMatrix3a& lhs, const LLMatrix3a& rhs )
 37{
 38	const LLVector4a col0 = lhs.getColumn(0);
 39	const LLVector4a col1 = lhs.getColumn(1);
 40	const LLVector4a col2 = lhs.getColumn(2);
 41
 42	for ( int i = 0; i < 3; i++ )
 43	{
 44		LLVector4a xxxx = _mm_load_ss( rhs.mColumns[i].getF32ptr() );
 45		xxxx.splat<0>( xxxx );
 46		xxxx.mul( col0 );
 47
 48		{
 49			LLVector4a yyyy = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  1 );
 50			yyyy.splat<0>( yyyy );
 51			yyyy.mul( col1 ); 
 52			xxxx.add( yyyy );
 53		}
 54
 55		{
 56			LLVector4a zzzz = _mm_load_ss( rhs.mColumns[i].getF32ptr() +  2 );
 57			zzzz.splat<0>( zzzz );
 58			zzzz.mul( col2 );
 59			xxxx.add( zzzz );
 60		}
 61
 62		xxxx.store4a( mColumns[i].getF32ptr() );
 63	}
 64	
 65}
 66
 67/*static */void LLMatrix3a::batchTransform( const LLMatrix3a& xform, const LLVector4a* src, int numVectors, LLVector4a* dst )
 68{
 69	const LLVector4a col0 = xform.getColumn(0);
 70	const LLVector4a col1 = xform.getColumn(1);
 71	const LLVector4a col2 = xform.getColumn(2);
 72	const LLVector4a* maxAddr = src + numVectors;
 73
 74	if ( numVectors & 0x1 )
 75	{
 76		LLVector4a xxxx = _mm_load_ss( (const F32*)src );
 77		LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
 78		LLVector4a zzzz = _mm_load_ss( (const F32*)src + 2 );
 79		xxxx.splat<0>( xxxx );
 80		yyyy.splat<0>( yyyy );
 81		zzzz.splat<0>( zzzz );
 82		xxxx.mul( col0 );
 83		yyyy.mul( col1 ); 
 84		zzzz.mul( col2 );
 85		xxxx.add( yyyy );
 86		xxxx.add( zzzz );
 87		xxxx.store4a( (F32*)dst );
 88		src++;
 89		dst++;
 90	}
 91
 92
 93	numVectors >>= 1;
 94	while ( src < maxAddr )
 95	{
 96		_mm_prefetch( (const char*)(src + 32 ), _MM_HINT_NTA );
 97		_mm_prefetch( (const char*)(dst + 32), _MM_HINT_NTA );
 98		LLVector4a xxxx = _mm_load_ss( (const F32*)src );
 99		LLVector4a xxxx1= _mm_load_ss( (const F32*)(src + 1) );
100
101		xxxx.splat<0>( xxxx );
102		xxxx1.splat<0>( xxxx1 );
103		xxxx.mul( col0 );
104		xxxx1.mul( col0 );
105
106		{
107			LLVector4a yyyy = _mm_load_ss( (const F32*)src + 1 );
108			LLVector4a yyyy1 = _mm_load_ss( (const F32*)(src + 1) + 1);
109			yyyy.splat<0>( yyyy );
110			yyyy1.splat<0>( yyyy1 );
111			yyyy.mul( col1 );
112			yyyy1.mul( col1 );
113			xxxx.add( yyyy );
114			xxxx1.add( yyyy1 );
115		}
116
117		{
118			LLVector4a zzzz = _mm_load_ss( (const F32*)(src) + 2 );
119			LLVector4a zzzz1 = _mm_load_ss( (const F32*)(++src) + 2 );
120			zzzz.splat<0>( zzzz );
121			zzzz1.splat<0>( zzzz1 );
122			zzzz.mul( col2 );
123			zzzz1.mul( col2 );
124			xxxx.add( zzzz );
125			xxxx1.add( zzzz1 );
126		}
127
128		xxxx.store4a(dst->getF32ptr());
129		src++;
130		dst++;
131
132		xxxx1.store4a((F32*)dst++);
133	}
134}