PageRenderTime 38ms CodeModel.GetById 10ms app.highlight 24ms RepoModel.GetById 1ms app.codeStats 1ms

/indra/llmath/llvector4a.cpp

https://bitbucket.org/lindenlab/viewer-beta/
C++ | 222 lines | 139 code | 36 blank | 47 comment | 6 complexity | 0fc17e1e87808c3ae4d9ef389995d856 MD5 | raw file
  1/** 
  2 * @file llvector4a.cpp
  3 * @brief SIMD vector implementation
  4 *
  5 * $LicenseInfo:firstyear=2010&license=viewerlgpl$
  6 * Second Life Viewer Source Code
  7 * Copyright (C) 2010, Linden Research, Inc.
  8 * 
  9 * This library is free software; you can redistribute it and/or
 10 * modify it under the terms of the GNU Lesser General Public
 11 * License as published by the Free Software Foundation;
 12 * version 2.1 of the License only.
 13 * 
 14 * This library is distributed in the hope that it will be useful,
 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 17 * Lesser General Public License for more details.
 18 * 
 19 * You should have received a copy of the GNU Lesser General Public
 20 * License along with this library; if not, write to the Free Software
 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 22 * 
 23 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 24 * $/LicenseInfo$
 25 */
 26
 27#include "llmath.h"
 28#include "llquantize.h"
 29
 30extern const LLQuad F_ZERO_4A		= { 0, 0, 0, 0 };
 31extern const LLQuad F_APPROXIMATELY_ZERO_4A = { 
 32	F_APPROXIMATELY_ZERO,
 33	F_APPROXIMATELY_ZERO,
 34	F_APPROXIMATELY_ZERO,
 35	F_APPROXIMATELY_ZERO
 36};
 37
 38extern const LLVector4a LL_V4A_ZERO = reinterpret_cast<const LLVector4a&> ( F_ZERO_4A );
 39extern const LLVector4a LL_V4A_EPSILON = reinterpret_cast<const LLVector4a&> ( F_APPROXIMATELY_ZERO_4A );
 40
 41/*static */void LLVector4a::memcpyNonAliased16(F32* __restrict dst, const F32* __restrict src, size_t bytes)
 42{
 43	assert(src != NULL);
 44	assert(dst != NULL);
 45	assert(bytes > 0);
 46	assert((bytes % sizeof(F32))== 0); 
 47	
 48	F32* end = dst + (bytes / sizeof(F32) );
 49
 50	if (bytes > 64)
 51	{
 52		F32* begin_64 = LL_NEXT_ALIGNED_ADDRESS_64(dst);
 53		
 54		//at least 64 (16*4) bytes before the end of the destination, switch to 16 byte copies
 55		F32* end_64 = end-16;
 56		
 57		_mm_prefetch((char*)begin_64, _MM_HINT_NTA);
 58		_mm_prefetch((char*)begin_64 + 64, _MM_HINT_NTA);
 59		_mm_prefetch((char*)begin_64 + 128, _MM_HINT_NTA);
 60		_mm_prefetch((char*)begin_64 + 192, _MM_HINT_NTA);
 61		
 62		while (dst < begin_64)
 63		{
 64			copy4a(dst, src);
 65			dst += 4;
 66			src += 4;
 67		}
 68		
 69		while (dst < end_64)
 70		{
 71			_mm_prefetch((char*)src + 512, _MM_HINT_NTA);
 72			_mm_prefetch((char*)dst + 512, _MM_HINT_NTA);
 73			copy4a(dst, src);
 74			copy4a(dst+4, src+4);
 75			copy4a(dst+8, src+8);
 76			copy4a(dst+12, src+12);
 77			
 78			dst += 16;
 79			src += 16;
 80		}
 81	}
 82
 83	while (dst < end)
 84	{
 85		copy4a(dst, src);
 86		dst += 4;
 87		src += 4;
 88	}
 89}
 90
 91void LLVector4a::setRotated( const LLRotation& rot, const LLVector4a& vec )
 92{
 93	const LLVector4a col0 = rot.getColumn(0);
 94	const LLVector4a col1 = rot.getColumn(1);
 95	const LLVector4a col2 = rot.getColumn(2);
 96
 97	LLVector4a result = _mm_load_ss( vec.getF32ptr() );
 98	result.splat<0>( result );
 99	result.mul( col0 );
100
101	{
102		LLVector4a yyyy = _mm_load_ss( vec.getF32ptr() +  1 );
103		yyyy.splat<0>( yyyy );
104		yyyy.mul( col1 ); 
105		result.add( yyyy );
106	}
107
108	{
109		LLVector4a zzzz = _mm_load_ss( vec.getF32ptr() +  2 );
110		zzzz.splat<0>( zzzz );
111		zzzz.mul( col2 );
112		result.add( zzzz );
113	}
114
115	*this = result;
116}
117
118void LLVector4a::setRotated( const LLQuaternion2& quat, const LLVector4a& vec )
119{
120	const LLVector4a& quatVec = quat.getVector4a();
121	LLVector4a temp; temp.setCross3(quatVec, vec);
122	temp.add( temp );
123	
124	const LLVector4a realPart( quatVec.getScalarAt<3>() );
125	LLVector4a tempTimesReal; tempTimesReal.setMul( temp, realPart );
126
127	mQ = vec;
128	add( tempTimesReal );
129	
130	LLVector4a imagCrossTemp; imagCrossTemp.setCross3( quatVec, temp );
131	add(imagCrossTemp);
132}
133
134void LLVector4a::quantize8( const LLVector4a& low, const LLVector4a& high )
135{
136	LLVector4a val(mQ);
137	LLVector4a delta; delta.setSub( high, low );
138
139	{
140		val.clamp(low, high);
141		val.sub(low);
142
143		// 8-bit quantization means we can do with just 12 bits of reciprocal accuracy
144		const LLVector4a oneOverDelta = _mm_rcp_ps(delta.mQ);
145// 		{
146// 			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
147// 			LLVector4a two; two.load4a( F_TWO_4A );
148// 
149// 			// Here we use _mm_rcp_ps plus one round of newton-raphson
150// 			// We wish to find 'x' such that x = 1/delta
151// 			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
152// 			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
153// 			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
154// 			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
155// 			oneOverDelta.setMul( delta, recipApprox );
156// 			oneOverDelta.setSub( two, oneOverDelta );
157// 			oneOverDelta.mul( recipApprox );
158// 		}
159
160		val.mul(oneOverDelta);
161		val.mul(*reinterpret_cast<const LLVector4a*>(F_U8MAX_4A));
162	}
163
164	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
165
166	{
167		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
168		val.mul(delta);
169		val.add(low);
170	}
171
172	{
173		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU8MAX_4A));
174		LLVector4a absVal; absVal.setAbs( val );
175		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
176	}	
177}
178
179void LLVector4a::quantize16( const LLVector4a& low, const LLVector4a& high )
180{
181	LLVector4a val(mQ);
182	LLVector4a delta; delta.setSub( high, low );
183
184	{
185		val.clamp(low, high);
186		val.sub(low);
187
188		// 16-bit quantization means we need a round of Newton-Raphson
189		LLVector4a oneOverDelta;
190		{
191			static LL_ALIGN_16( const F32 F_TWO_4A[4] ) = { 2.f, 2.f, 2.f, 2.f };
192			LLVector4a two; two.load4a( F_TWO_4A );
193
194			// Here we use _mm_rcp_ps plus one round of newton-raphson
195			// We wish to find 'x' such that x = 1/delta
196			// As a first approximation, we take x0 = _mm_rcp_ps(delta)
197			// Then x1 = 2 * x0 - a * x0^2 or x1 = x0 * ( 2 - a * x0 )
198			// See Intel AP-803 http://ompf.org/!/Intel_application_note_AP-803.pdf
199			const LLVector4a recipApprox = _mm_rcp_ps(delta.mQ);
200			oneOverDelta.setMul( delta, recipApprox );
201			oneOverDelta.setSub( two, oneOverDelta );
202			oneOverDelta.mul( recipApprox );
203		}
204
205		val.mul(oneOverDelta);
206		val.mul(*reinterpret_cast<const LLVector4a*>(F_U16MAX_4A));
207	}
208
209	val = _mm_cvtepi32_ps(_mm_cvtps_epi32( val.mQ ));
210
211	{
212		val.mul(*reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
213		val.mul(delta);
214		val.add(low);
215	}
216
217	{
218		LLVector4a maxError; maxError.setMul(delta, *reinterpret_cast<const LLVector4a*>(F_OOU16MAX_4A));
219		LLVector4a absVal; absVal.setAbs( val );
220		setSelectWithMask( absVal.lessThan( maxError ), F_ZERO_4A, val );
221	}	
222}