
/external/pysoundtouch14/libsoundtouch/sse_optimized.cpp

http://echo-nest-remix.googlecode.com/
////////////////////////////////////////////////////////////////////////////////
///
/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE
/// optimized functions have been gathered into this single source
/// code file, regardless of their class or original source code file, in order
/// to ease porting the library to other compiler and processor platforms.
///
/// The SSE optimizations are programmed using SSE compiler intrinsics that
/// are supported by both the Microsoft Visual C++ and GCC compilers, so this file
/// should compile with both toolsets.
///
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
/// 6.0 processor pack" update to support the SSE instruction set. The update is
/// available for download at the Microsoft Developer Network, see here:
/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
/// perform a search with the keywords "processor pack".
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date: 2009-01-25 16:13:39 +0200 (Sun, 25 Jan 2009) $
// File revision : $Revision: 4 $
//
// $Id: sse_optimized.cpp 51 2009-01-25 14:13:39Z oparviai $
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
//  SoundTouch audio processing library
//  Copyright (c) Olli Parviainen
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////

#include "cpu_detect.h"
#include "STTypes.h"

using namespace soundtouch;

#ifdef ALLOW_SSE

// SSE routines available only with float sample type

//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'TDStretchSSE'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <xmmintrin.h>

// Calculates cross correlation of two buffers
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
{
    int i;
    float *pVec1;
    __m128 vSum, *pVec2;

    // Note: having to tolerate unaligned __m128 memory accesses causes a major
    // slow-down. It is much faster if we can skip unaligned slots and use the
    // _mm_load_ps instruction instead of _mm_loadu_ps. This can mean up to a
    // ~10-fold difference (part of which is due to skipping every second round
    // for stereo sound, though).
    //
    // The compile-time define ALLOW_NONEXACT_SIMD_OPTIMIZATION chooses whether
    // this little cheating is allowed.
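    //
    // Added note (an assumption drawn from the comment above, not from the
    // original SoundTouch source): one stereo float frame is 2 channels * 4
    // bytes = 8 bytes, so in a 16-byte aligned buffer every second frame
    // offset falls on a 16-byte boundary. Returning a huge negative value for
    // the unaligned offsets simply makes those candidate offsets lose the
    // correlation comparison; that is the "cheating" referred to above.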
#ifdef ALLOW_NONEXACT_SIMD_OPTIMIZATION
    // Little cheating allowed: return a valid correlation only for
    // aligned locations, meaning every second round for stereo sound.

    #define _MM_LOAD    _mm_load_ps

    if (((ulong)pV1) & 15) return -1e50;    // skip unaligned locations

#else
    // No cheating allowed: use unaligned loads & take the resulting
    // performance hit.
    #define _MM_LOAD    _mm_loadu_ps
#endif

    // ensure overlapLength is divisible by 8
    assert((overlapLength % 8) == 0);

    // Calculates the cross-correlation value between the 'pV1' and 'pV2' vectors
    // Note: pV2 _must_ be aligned to a 16-byte boundary, pV1 need not.
    pVec1 = (float*)pV1;
    pVec2 = (__m128*)pV2;
    vSum = _mm_setzero_ps();

    // Unroll the loop by a factor of 4 * 4 operations
    for (i = 0; i < overlapLength / 8; i ++)
    {
        // vSum += pV1[0..3] * pV2[0..3]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1), pVec2[0]));

        // vSum += pV1[4..7] * pV2[4..7]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));

        // vSum += pV1[8..11] * pV2[8..11]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));

        // vSum += pV1[12..15] * pV2[12..15]
        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));

        pVec1 += 16;
        pVec2 += 4;
    }

    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
    float *pvSum = (float*)&vSum;
    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);

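    /* Added note, a minimal sketch rather than part of the original SoundTouch
       source: the same horizontal sum could also be done with shuffle
       intrinsics instead of reading the register back through a float pointer,
       assuming the compiler provides _mm_cvtss_f32:

        __m128 tmp = _mm_add_ps(vSum, _mm_movehl_ps(vSum, vSum)); // lanes: [0+2, 1+3, ..]
        tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 0x01));    // lane 0 = total sum
        return (double)_mm_cvtss_f32(tmp);                        // extract lane 0
    */
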
    /* This is approximately the corresponding routine in plain C:
    double corr;
    uint i;

    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    corr = 0.0;
    for (i = 0; i < overlapLength / 8; i ++)
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
                pV1[2] * pV2[2] +
                pV1[3] * pV2[3] +
                pV1[4] * pV2[4] +
                pV1[5] * pV2[5] +
                pV1[6] * pV2[6] +
                pV1[7] * pV2[7] +
                pV1[8] * pV2[8] +
                pV1[9] * pV2[9] +
                pV1[10] * pV2[10] +
                pV1[11] * pV2[11] +
                pV1[12] * pV2[12] +
                pV1[13] * pV2[13] +
                pV1[14] * pV2[14] +
                pV1[15] * pV2[15];

        pV1 += 16;
        pV2 += 16;
    }
    */

    /* This is the corresponding routine in assembler. It may be a tiny bit faster
       than the intrinsic version, but it is more difficult to maintain & get
       compiled on multiple platforms.

    uint overlapLengthLocal = overlapLength;
    float corr;

    _asm
    {
        // Very important note: data in 'pV2' _must_ be aligned to a
        // 16-byte boundary!

        // give prefetch hints to the CPU about data that will be needed soon;
        // give more aggressive hints on pV1 as it changes while pV2 stays
        // the same between runs
        prefetcht0 [pV1]
        prefetcht0 [pV2]
        prefetcht0 [pV1 + 32]

        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2

        xorps   xmm0, xmm0

        mov     ecx, overlapLengthLocal
        shr     ecx, 3  // div by eight

    loop1:
        prefetcht0 [eax + 64]     // prefetch hint: data needed soon
        prefetcht0 [ebx + 32]     // prefetch hint: data needed soon
        movups  xmm1, [eax]
        mulps   xmm1, [ebx]
        addps   xmm0, xmm1

        movups  xmm2, [eax + 16]
        mulps   xmm2, [ebx + 16]
        addps   xmm0, xmm2

        prefetcht0 [eax + 96]     // prefetch hint: data needed soon
        prefetcht0 [ebx + 64]     // prefetch hint: data needed soon

        movups  xmm3, [eax + 32]
        mulps   xmm3, [ebx + 32]
        addps   xmm0, xmm3

        movups  xmm4, [eax + 48]
        mulps   xmm4, [ebx + 48]
        addps   xmm0, xmm4

        add     eax, 64
        add     ebx, 64

        dec     ecx
        jnz     loop1

        // add the four floats of xmm0 together and return the result.

        movhlps xmm1, xmm0          // move lanes 3 & 4 of xmm0 to lanes 1 & 2 of xmm1
        addps   xmm1, xmm0
        movaps  xmm2, xmm1
        shufps  xmm2, xmm2, 0x01    // move lane 2 of xmm2 to lane 1 of xmm2
        addss   xmm2, xmm1
        movss   corr, xmm2
    }

    return (double)corr;
    */
}


//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

FIRFilterSSE::FIRFilterSSE() : FIRFilter()
{
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}


FIRFilterSSE::~FIRFilterSSE()
{
    delete[] filterCoeffsUnalign;
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}


// (overloaded) Calculates filter coefficients for the SSE routine
void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    float fDivider;

    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Scale the filter coefficients so that it won't be necessary to scale the filtering result;
    // also rearrange the coefficients suitably for SSE.
    // Ensure that the filter coeffs array is aligned to a 16-byte boundary.
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = new float[2 * newLength + 4];
    filterCoeffsAlign = (float *)(((unsigned long)filterCoeffsUnalign + 15) & (ulong)-16);

    fDivider = (float)resultDivider;

    // rearrange the filter coefficients for the SSE routine
    for (i = 0; i < newLength; i ++)
    {
        filterCoeffsAlign[2 * i + 0] =
        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
    }
}
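
/* Added illustration (not part of the original SoundTouch source): for a
   4-tap filter with coefficients {c0, c1, c2, c3} and resultDivider d, the
   loop above lays filterCoeffsAlign out as

       { c0/d, c0/d, c1/d, c1/d, c2/d, c2/d, c3/d, c3/d }

   so each __m128 coefficient load lines up with two interleaved stereo (L/R)
   samples in the filter loop below. The '+ 4' in the
   'new float[2 * newLength + 4]' allocation leaves 16 spare bytes, which
   guarantees that the pointer rounded up to a 16-byte boundary still leaves
   room for all 2 * newLength coefficients inside the allocated buffer. */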



// SSE-optimized version of the filter routine for stereo sound
uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
{
    int count = (int)((numSamples - length) & (uint)-2);
    int j;

    assert(count % 2 == 0);

    if (count < 2) return 0;

    assert(source != NULL);
    assert(dest != NULL);
    assert((length % 8) == 0);
    assert(filterCoeffsAlign != NULL);
    assert(((ulong)filterCoeffsAlign) % 16 == 0);

    // the filter is evaluated for two stereo samples with each iteration, hence the 'j += 2'
    for (j = 0; j < count; j += 2)
    {
        float *pSrc;
        const __m128 *pFil;
        __m128 sum1, sum2;
        uint i;

        pSrc = (float*)source;              // source audio data
        pFil = (__m128*)filterCoeffsAlign;  // filter coefficients. NOTE: assumes the coefficients
                                            // are aligned to a 16-byte boundary
        sum1 = sum2 = _mm_setzero_ps();

        for (i = 0; i < length / 8; i ++)
        {
            // Unroll the loop for efficiency & calculate the filter for 2*2 stereo samples
            // at each pass.

            // sum1 is the accumulator for 2*2 filtered stereo samples at the primary sample offset;
            // sum2 is the accumulator for 2*2 filtered stereo samples at the next sample offset.

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc),      pFil[0]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2),  pFil[0]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4),  pFil[1]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6),  pFil[1]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8),  pFil[2]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));

            pSrc += 16;
            pFil += 4;
        }

        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
        // to sum the two hi- and lo-floats of these registers together.

        // post-shuffle & add the filtered values and store to dest.
        _mm_storeu_ps(dest, _mm_add_ps(
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
                    ));
        source += 4;
        dest += 4;
    }

    // Ideas for further improvement:
    // 1. If it could be guaranteed that 'source' were always aligned to a 16-byte
    //    boundary, the faster aligned '_mm_load_ps' instruction could be used.
    // 2. If it could be guaranteed that 'dest' were always aligned to a 16-byte
    //    boundary, the faster '_mm_store_ps' instruction could be used.
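    //
    // Added sketch (an assumption, not part of the original source): idea 1
    // could be applied with a run-time alignment check outside the loops,
    // e.g. something along the lines of
    //
    //    if ((((ulong)source) & 15) == 0)
    //    {
    //        // loop variant where the sum1 loads (pSrc, pSrc + 4, pSrc + 8,
    //        // pSrc + 12) use the aligned _mm_load_ps; the sum2 loads at the
    //        // odd sample offsets (pSrc + 2, ...) are only 8-byte aligned and
    //        // would still need _mm_loadu_ps.
    //    }
    //
    // Since 'source' advances by 4 floats (16 bytes) per outer iteration, the
    // alignment established at entry is preserved. Duplicating the loop keeps
    // the alignment branch out of the hot path; the structure above is
    // illustrative only.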

    return (uint)count;

    /* Original routine in plain C. Please notice that the C version has differently
       organized coefficients, though.
    double suml1, suml2;
    double sumr1, sumr2;
    uint i, j;

    for (j = 0; j < count; j += 2)
    {
        const float *ptr;
        const float *pFil;

        suml1 = sumr1 = 0.0;
        suml2 = sumr2 = 0.0;
        ptr = src;
        pFil = filterCoeffs;
        for (i = 0; i < lengthLocal; i ++)
        {
            // unroll loop for efficiency.

            suml1 += ptr[0] * pFil[0] +
                     ptr[2] * pFil[2] +
                     ptr[4] * pFil[4] +
                     ptr[6] * pFil[6];

            sumr1 += ptr[1] * pFil[1] +
                     ptr[3] * pFil[3] +
                     ptr[5] * pFil[5] +
                     ptr[7] * pFil[7];

            suml2 += ptr[8] * pFil[0] +
                     ptr[10] * pFil[2] +
                     ptr[12] * pFil[4] +
                     ptr[14] * pFil[6];

            sumr2 += ptr[9] * pFil[1] +
                     ptr[11] * pFil[3] +
                     ptr[13] * pFil[5] +
                     ptr[15] * pFil[7];

            ptr += 16;
            pFil += 8;
        }
        dest[0] = (float)suml1;
        dest[1] = (float)sumr1;
        dest[2] = (float)suml2;
        dest[3] = (float)sumr2;

        src += 4;
        dest += 4;
    }
    */


    /* A similar routine in assembly, again kept only for reference since the intrinsic
       version is easier to maintain.
    _asm
    {
        // Very important note: data in 'src' _must_ be aligned to a
        // 16-byte boundary!
        mov     edx, count
        mov     ebx, dword ptr src
        mov     eax, dword ptr dest
        shr     edx, 1

    loop1:
        // "outer loop" : during each round 2*2 output samples are calculated

        // give prefetch hints to the CPU about data that will be needed soon
        prefetcht0 [ebx]
        prefetcht0 [filterCoeffsLocal]

        mov     esi, ebx
        mov     edi, filterCoeffsLocal
        xorps   xmm0, xmm0
        xorps   xmm1, xmm1
        mov     ecx, lengthLocal

    loop2:
        // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
        prefetcht0 [esi + 32]     // prefetch hint: data needed soon
        prefetcht0 [edi + 32]     // prefetch hint: data needed soon

        movups  xmm2, [esi]         // possibly unaligned load
        movups  xmm3, [esi + 8]     // possibly unaligned load
        mulps   xmm2, [edi]
        mulps   xmm3, [edi]
        addps   xmm0, xmm2
        addps   xmm1, xmm3

        movups  xmm4, [esi + 16]    // possibly unaligned load
        movups  xmm5, [esi + 24]    // possibly unaligned load
        mulps   xmm4, [edi + 16]
        mulps   xmm5, [edi + 16]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        prefetcht0 [esi + 64]     // prefetch hint: data needed soon
        prefetcht0 [edi + 64]     // prefetch hint: data needed soon

        movups  xmm6, [esi + 32]    // possibly unaligned load
        movups  xmm7, [esi + 40]    // possibly unaligned load
        mulps   xmm6, [edi + 32]
        mulps   xmm7, [edi + 32]
        addps   xmm0, xmm6
        addps   xmm1, xmm7

        movups  xmm4, [esi + 48]    // possibly unaligned load
        movups  xmm5, [esi + 56]    // possibly unaligned load
        mulps   xmm4, [edi + 48]
        mulps   xmm5, [edi + 48]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loop2

        // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
        // to sum the two hi- and lo-floats of these registers together.

        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
        addps   xmm0, xmm2

        movaps  [eax], xmm0
        add     ebx, 16
        add     eax, 16

        dec     edx
        jnz     loop1
    }
    */
}

#endif  // ALLOW_SSE