media/kiss_fft/README.simd

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/kiss_fft/README.simd	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,78 @@
     1.4 +If you are reading this, it means you think you may be interested in using the SIMD extensions in kissfft 
     1.5 +to do 4 *separate* FFTs at once.
     1.6 +
     1.7 +Beware! Beyond here there be dragons!
     1.8 +
     1.9 +This API is not easy to use, is not well documented, and breaks the KISS principle.  
    1.10 +
    1.11 +
    1.12 +Still reading? Okay, you may get rewarded for your patience with a considerable speedup 
    1.13 +(2-3x) on Intel x86 machines with SSE if you are willing to jump through some hoops.
    1.14 +
    1.15 +The basic idea is to use the packed 4-float __m128 data type as a scalar element.  
    1.16 +This means that the format is pretty convoluted. It performs 4 FFTs per fft call on signals A,B,C,D.
    1.17 +
    1.18 +For complex data, the data is interleaved as follows:
    1.19 +rA0,rB0,rC0,rD0,      iA0,iB0,iC0,iD0,   rA1,rB1,rC1,rD1, iA1,iB1,iC1,iD1 ...
    1.20 +where "rA0" is the real part of the zeroth sample for signal A.
    1.21 +
    1.22 +Real-only data is laid out:
    1.23 +rA0,rB0,rC0,rD0,     rA1,rB1,rC1,rD1,      ... 
    1.24 +
    1.25 +Compile with gcc using flags such as:
    1.26 +-O3 -mpreferred-stack-boundary=4  -DUSE_SIMD=1 -msse 
    1.27 +
    1.28 +Be aware of SIMD alignment.  This is the most likely cause of segfaults.  
    1.29 +The code within kissfft uses scratch variables on the stack.  
    1.30 +With SIMD, these must have addresses on 16 byte boundaries.  
    1.31 +Search on "SIMD alignment" for more info.
    1.32 +
    1.33 +
    1.34 +
    1.35 +Robin at Divide Concept was kind enough to share his code for formatting to/from the SIMD kissfft.  
    1.36 +I have not run it -- use it at your own risk.  It appears to do 4xN and Nx4 transpositions 
    1.37 +(out of place).
    1.38 +
    1.39 +void SSETools::pack128(float* target, float* source, unsigned long size128) // Interleaves four consecutive runs of size128 floats from source into size128 four-float vectors at target.
    1.40 +{
    1.41 +   __m128* pDest = (__m128*)target; // NOTE(review): stores go through an __m128*, so target presumably needs 16-byte alignment -- confirm with caller
    1.42 +   __m128* pDestEnd = pDest+size128; // one past the last vector written
    1.43 +   float* source0=source; // run 0: source[0 .. size128)
    1.44 +   float* source1=source0+size128; // run 1: source[size128 .. 2*size128)
    1.45 +   float* source2=source1+size128; // run 2: source[2*size128 .. 3*size128)
    1.46 +   float* source3=source2+size128; // run 3: source[3*size128 .. 4*size128)
    1.47 +
    1.48 +   while(pDest<pDestEnd) // one output vector per iteration
    1.49 +   {
    1.50 +       *pDest=_mm_set_ps(*source3,*source2,*source1,*source0); // _mm_set_ps takes the highest element first, so the lowest element (first in memory) comes from source0
    1.51 +       source0++;
    1.52 +       source1++;
    1.53 +       source2++;
    1.54 +       source3++;
    1.55 +       pDest++;
    1.56 +   }
    1.57 +}
    1.58 +
    1.59 +void SSETools::unpack128(float* target, float* source, unsigned long size128) // De-interleaves size128 groups of four floats from source into four consecutive runs of size128 floats at target.
    1.60 +{
    1.61 +   // Both buffers are accessed as plain floats here; no __m128 loads or stores are used in this function.
    1.62 +   float* pSrc = source; // read cursor over the interleaved input
    1.63 +   float* pSrcEnd = pSrc+size128*4; // the input holds size128 groups of 4 floats
    1.64 +   float* target0=target; // run 0: target[0 .. size128)
    1.65 +   float* target1=target0+size128; // run 1: target[size128 .. 2*size128)
    1.66 +   float* target2=target1+size128; // run 2: target[2*size128 .. 3*size128)
    1.67 +   float* target3=target2+size128; // run 3: target[3*size128 .. 4*size128)
    1.68 +
    1.69 +   while(pSrc<pSrcEnd) // one group of 4 input floats per iteration
    1.70 +   {
    1.71 +       *target0=pSrc[0]; // element k of each group goes to run k
    1.72 +       *target1=pSrc[1];
    1.73 +       *target2=pSrc[2];
    1.74 +       *target3=pSrc[3];
    1.75 +       target0++;
    1.76 +       target1++;
    1.77 +       target2++;
    1.78 +       target3++;
    1.79 +       pSrc+=4; // advance to the next group of 4
    1.80 +   }
    1.81 +} 

mercurial