/external/pysoundtouch14/libsoundtouch/sse_optimized.cpp


////////////////////////////////////////////////////////////////////////////////
///
/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE
/// optimized functions have been gathered into this single source
/// code file, regardless of their class or original source code file, in order
/// to ease porting the library to other compiler and processor platforms.
///
/// The SSE optimizations are programmed using SSE compiler intrinsics that
/// are supported both by the Microsoft Visual C++ and GCC compilers, so this file
/// should compile with both toolsets.
///
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
/// 6.0 processor pack" update to support the SSE instruction set. The update is
/// available for download at the Microsoft Developer Network, see here:
/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
/// perform a search with the keywords "processor pack".
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date: 2009-01-25 16:13:39 +0200 (Sun, 25 Jan 2009) $
// File revision : $Revision: 4 $
//
// $Id: sse_optimized.cpp 51 2009-01-25 14:13:39Z oparviai $
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
//  SoundTouch audio processing library
//  Copyright (c) Olli Parviainen
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////
#include "cpu_detect.h"
#include "STTypes.h"
#include <assert.h>     // assert() is used by the routines below

using namespace soundtouch;

#ifdef ALLOW_SSE

// SSE routines available only with float sample type

//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'TDStretchSSE'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <xmmintrin.h>
// Calculates the cross correlation of two buffers
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
{
    int i;
    float *pVec1;
    __m128 vSum, *pVec2;

    // Note: It means a major slow-down if the routine needs to tolerate
    // unaligned __m128 memory accesses. It's way faster if we can skip
    // unaligned slots and use the _mm_load_ps instruction instead of _mm_loadu_ps.
    // This can mean up to a ~10-fold difference (part of which is
    // due to skipping every second round for stereo sound though).
    //
    // The compile-time define ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
    // for choosing whether this little cheating is allowed.

#ifdef ALLOW_NONEXACT_SIMD_OPTIMIZATION
    // Little cheating allowed: return a valid correlation only for
    // aligned locations, meaning every second round for stereo sound.

    #define _MM_LOAD    _mm_load_ps

    if (((ulong)pV1) & 15) return -1e50;    // skip unaligned locations

#else
    // No cheating allowed: use the unaligned load & take the resulting
    // performance hit.
    #define _MM_LOAD    _mm_loadu_ps
#endif

    // ensure overlapLength is divisible by 8
    assert((overlapLength % 8) == 0);

    // Calculate the cross-correlation value between the 'pV1' and 'pV2' vectors
    // Note: pV2 _must_ be aligned to a 16-byte boundary, pV1 need not.
    pVec1 = (float*)pV1;
    pVec2 = (__m128*)pV2;
    vSum = _mm_setzero_ps();
    // Unroll the loop by a factor of 4: each round processes 4 * 4 = 16 floats
    for (i = 0; i < overlapLength / 8; i ++)
    {
        // vSum += pV1[0..3] * pV2[0..3]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1), pVec2[0]));

        // vSum += pV1[4..7] * pV2[4..7]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));

        // vSum += pV1[8..11] * pV2[8..11]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));

        // vSum += pV1[12..15] * pV2[12..15]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));

        pVec1 += 16;
        pVec2 += 4;
    }

    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
    float *pvSum = (float*)&vSum;
    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
    /* This is approximately the corresponding routine in C-language:

    double corr;
    uint i;

    // Calculates the cross-correlation value between the 'pV1' and 'pV2' vectors
    corr = 0.0;
    for (i = 0; i < overlapLength / 8; i ++)
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
                pV1[2] * pV2[2] +
                pV1[3] * pV2[3] +
                pV1[4] * pV2[4] +
                pV1[5] * pV2[5] +
                pV1[6] * pV2[6] +
                pV1[7] * pV2[7] +
                pV1[8] * pV2[8] +
                pV1[9] * pV2[9] +
                pV1[10] * pV2[10] +
                pV1[11] * pV2[11] +
                pV1[12] * pV2[12] +
                pV1[13] * pV2[13] +
                pV1[14] * pV2[14] +
                pV1[15] * pV2[15];
        pV1 += 16;
        pV2 += 16;
    }
    */
    /* This is the corresponding routine in assembler. This may be a teeny-weeny bit
    faster than the intrinsic version, but it is more difficult to maintain & get
    compiled on multiple platforms.

    uint overlapLengthLocal = overlapLength;
    float corr;

    _asm
    {
        // Very important note: data in 'pV2' _must_ be aligned to a
        // 16-byte boundary!

        // give prefetch hints to the CPU of what data are to be needed soonish;
        // give more aggressive hints on pV1 as that changes while pV2 stays the
        // same between runs
        prefetcht0 [pV1]
        prefetcht0 [pV2]
        prefetcht0 [pV1 + 32]

        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2

        xorps   xmm0, xmm0

        mov     ecx, overlapLengthLocal
        shr     ecx, 3  // div by eight

    loop1:
        prefetcht0 [eax + 64]   // give a prefetch hint to the CPU what data are to be needed soonish
        prefetcht0 [ebx + 32]   // give a prefetch hint to the CPU what data are to be needed soonish
        movups  xmm1, [eax]
        mulps   xmm1, [ebx]
        addps   xmm0, xmm1

        movups  xmm2, [eax + 16]
        mulps   xmm2, [ebx + 16]
        addps   xmm0, xmm2

        prefetcht0 [eax + 96]   // give a prefetch hint to the CPU what data are to be needed soonish
        prefetcht0 [ebx + 64]   // give a prefetch hint to the CPU what data are to be needed soonish

        movups  xmm3, [eax + 32]
        mulps   xmm3, [ebx + 32]
        addps   xmm0, xmm3

        movups  xmm4, [eax + 48]
        mulps   xmm4, [ebx + 48]
        addps   xmm0, xmm4

        add     eax, 64
        add     ebx, 64
        dec     ecx
        jnz     loop1

        // add the four floats of xmm0 together and return the result.

        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
        addps   xmm1, xmm0
        movaps  xmm2, xmm1
        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
        addss   xmm2, xmm1
        movss   corr, xmm2
    }

    return (double)corr;
    */
}
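// Side note (not part of the original routine): the horizontal sum above reads the
// __m128 register back through a float pointer. A hedged alternative sketch that
// stays within the SSE1 intrinsics of <xmmintrin.h> is shown below; 'horizontalSum'
// is a hypothetical helper name used for illustration only.
/*
static inline float horizontalSum(__m128 v)
{
    __m128 hi   = _mm_movehl_ps(v, v);              // { v2, v3, v2, v3 }
    __m128 sum  = _mm_add_ps(v, hi);                // element 0 = v0+v2, element 1 = v1+v3
    __m128 swap = _mm_shuffle_ps(sum, sum, 0x01);   // bring element 1 down to element 0
    sum = _mm_add_ss(sum, swap);                    // element 0 = v0+v1+v2+v3
    return _mm_cvtss_f32(sum);                      // extract the lowest float
}
*/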
//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

FIRFilterSSE::FIRFilterSSE() : FIRFilter()
{
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}

FIRFilterSSE::~FIRFilterSSE()
{
    delete[] filterCoeffsUnalign;
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}
// (overloaded) Calculates filter coefficients for the SSE routine
void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    float fDivider;

    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Scale the filter coefficients so that it won't be necessary to scale the filtering result;
    // also rearrange the coefficients suitably for the SSE routine.
    // Ensure that the filter coeffs array is aligned to a 16-byte boundary.
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = new float[2 * newLength + 4];
    filterCoeffsAlign = (float *)(((unsigned long)filterCoeffsUnalign + 15) & (ulong)-16);

    fDivider = (float)resultDivider;

    // rearrange the filter coefficients for the SSE routine: duplicate each
    // coefficient for the left and right channels
    for (i = 0; i < newLength; i ++)
    {
        filterCoeffsAlign[2 * i + 0] =
        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
    }
}
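// Side note (not part of the original routine): the pointer-rounding trick above
// assumes that 'unsigned long' is wide enough to hold a pointer. A hedged alternative
// sketch using the _mm_malloc/_mm_free helpers that ship alongside the SSE intrinsics
// (<mm_malloc.h> with GCC, <malloc.h> with MSVC) could look like this;
// 'allocAlignedCoeffs' is a hypothetical name used for illustration only.
/*
static float * allocAlignedCoeffs(uint numCoeffs)
{
    // request a 16-byte aligned block big enough for the interleaved left/right copies
    return (float *)_mm_malloc(2 * numCoeffs * sizeof(float), 16);
    // such a block would later be released with _mm_free() instead of delete[]
}
*/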
// SSE-optimized version of the filter routine for stereo sound
uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
{
    int count = (int)((numSamples - length) & (uint)-2);
    int j;

    assert(count % 2 == 0);
    if (count < 2) return 0;

    assert(source != NULL);
    assert(dest != NULL);
    assert((length % 8) == 0);
    assert(filterCoeffsAlign != NULL);
    assert(((ulong)filterCoeffsAlign) % 16 == 0);

    // the filter is evaluated for two stereo samples with each iteration, thus the use of 'j += 2'
    for (j = 0; j < count; j += 2)
    {
        float *pSrc;
        const __m128 *pFil;
        __m128 sum1, sum2;
        uint i;

        pSrc = (float*)source;              // source audio data
        pFil = (__m128*)filterCoeffsAlign;  // filter coefficients. NOTE: assumes the coefficients
                                            // are aligned to a 16-byte boundary
        sum1 = sum2 = _mm_setzero_ps();

        for (i = 0; i < length / 8; i ++)
        {
            // Unroll the loop for efficiency & calculate the filter for 2*2 stereo samples
            // at each pass:
            // sum1 is the accumulator for 2*2 filtered stereo sound data at the primary sound data offset
            // sum2 is the accumulator for 2*2 filtered stereo sound data for the next sound sample offset
            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc)     , pFil[0]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2) , pFil[0]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4) , pFil[1]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6) , pFil[1]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) , pFil[2]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));

            pSrc += 16;
            pFil += 4;
        }

        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
        // to sum the two hi- and lo-floats of these registers together.

        // post-shuffle & add the filtered values and store to dest.
        _mm_storeu_ps(dest, _mm_add_ps(
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
                    ));
        source += 4;
        dest += 4;
    }

    // Ideas for further improvement:
    // 1. If it could be guaranteed that 'source' were always aligned to a 16-byte
    //    boundary, the faster aligned '_mm_load_ps' instruction could be used.
    // 2. If it could be guaranteed that 'dest' were always aligned to a 16-byte
    //    boundary, the faster '_mm_store_ps' instruction could be used.
    // (A minimal sketch of idea 1 follows after this function.)
    return (uint)count;
    /* original routine in C-language. please notice the C-version has differently
       organized coefficients though.

    double suml1, suml2;
    double sumr1, sumr2;
    uint i, j;

    for (j = 0; j < count; j += 2)
    {
        const float *ptr;
        const float *pFil;

        suml1 = sumr1 = 0.0;
        suml2 = sumr2 = 0.0;
        ptr = src;
        pFil = filterCoeffs;
        for (i = 0; i < lengthLocal; i ++)
        {
            // unroll loop for efficiency.

            suml1 += ptr[0] * pFil[0] +
                     ptr[2] * pFil[2] +
                     ptr[4] * pFil[4] +
                     ptr[6] * pFil[6];

            sumr1 += ptr[1] * pFil[1] +
                     ptr[3] * pFil[3] +
                     ptr[5] * pFil[5] +
                     ptr[7] * pFil[7];

            suml2 += ptr[8] * pFil[0] +
                     ptr[10] * pFil[2] +
                     ptr[12] * pFil[4] +
                     ptr[14] * pFil[6];

            sumr2 += ptr[9] * pFil[1] +
                     ptr[11] * pFil[3] +
                     ptr[13] * pFil[5] +
                     ptr[15] * pFil[7];

            ptr += 16;
            pFil += 8;
        }
        dest[0] = (float)suml1;
        dest[1] = (float)sumr1;
        dest[2] = (float)suml2;
        dest[3] = (float)sumr2;

        src += 4;
        dest += 4;
    }
    */
    /* Similar routine in assembly, again obsoleted due to maintainability

    _asm
    {
        // Very important note: data in 'src' _must_ be aligned to a
        // 16-byte boundary!

        mov     edx, count
        mov     ebx, dword ptr src
        mov     eax, dword ptr dest
        shr     edx, 1

    loop1:
        // "outer loop" : during each round 2*2 output samples are calculated

        // give prefetch hints to the CPU of what data are to be needed soonish
        prefetcht0 [ebx]
        prefetcht0 [filterCoeffsLocal]

        mov     esi, ebx
        mov     edi, filterCoeffsLocal
        xorps   xmm0, xmm0
        xorps   xmm1, xmm1
        mov     ecx, lengthLocal

    loop2:
        // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
        prefetcht0 [esi + 32]       // give a prefetch hint to the CPU what data are to be needed soonish
        prefetcht0 [edi + 32]       // give a prefetch hint to the CPU what data are to be needed soonish

        movups  xmm2, [esi]         // possibly unaligned load
        movups  xmm3, [esi + 8]     // possibly unaligned load
        mulps   xmm2, [edi]
        mulps   xmm3, [edi]
        addps   xmm0, xmm2
        addps   xmm1, xmm3

        movups  xmm4, [esi + 16]    // possibly unaligned load
        movups  xmm5, [esi + 24]    // possibly unaligned load
        mulps   xmm4, [edi + 16]
        mulps   xmm5, [edi + 16]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        prefetcht0 [esi + 64]       // give a prefetch hint to the CPU what data are to be needed soonish
        prefetcht0 [edi + 64]       // give a prefetch hint to the CPU what data are to be needed soonish

        movups  xmm6, [esi + 32]    // possibly unaligned load
        movups  xmm7, [esi + 40]    // possibly unaligned load
        mulps   xmm6, [edi + 32]
        mulps   xmm7, [edi + 32]
        addps   xmm0, xmm6
        addps   xmm1, xmm7

        movups  xmm4, [esi + 48]    // possibly unaligned load
        movups  xmm5, [esi + 56]    // possibly unaligned load
        mulps   xmm4, [edi + 48]
        mulps   xmm5, [edi + 48]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loop2

        // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
        // to sum the two hi- and lo-floats of these registers together.

        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
        addps   xmm0, xmm2

        movaps  [eax], xmm0
        add     ebx, 16
        add     eax, 16

        dec     edx
        jnz     loop1
    }
    */
}
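// A minimal sketch (not part of the original routine) of improvement idea 1 above:
// test the alignment of 'source' once per call and pick the aligned load where it
// applies. 'loadFour' is a hypothetical helper name; in practice the test would be
// hoisted out of the loops by selecting between two loop variants. Note that the
// 'pSrc + 2' style loads stay misaligned by 8 bytes even for an aligned 'source',
// so only every other load could benefit.
/*
static inline __m128 loadFour(const float *p, bool aligned)
{
    // _mm_load_ps requires a 16-byte aligned address; otherwise fall back
    // to the slower unaligned load.
    return aligned ? _mm_load_ps(p) : _mm_loadu_ps(p);
}

// at the top of evaluateFilterStereo:
//     bool srcAligned = ((((ulong)source) & 15) == 0);
*/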
#endif  // ALLOW_SSE
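// A rough sketch (not part of this file) of how the library typically selects these
// SSE implementations at run time: elsewhere in SoundTouch, a factory checks the
// flags reported by detectCPUextensions() from cpu_detect.h and instantiates the SSE
// subclass when SSE support is present. 'createTDStretch' is a hypothetical name
// used for illustration only.
/*
#include "cpu_detect.h"
#include "TDStretch.h"

TDStretch * createTDStretch()
{
    uint ext = detectCPUextensions();
    if (ext & SUPPORT_SSE) return new TDStretchSSE;     // SSE-capable CPU, float samples
    return new TDStretch;                               // portable fallback
}
*/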