[MPlayer-dev-eng] [PATCH]: af_hrtf SIMD optimization (3DNOW!/SSE/SSE3/plain C)

Yue Shi Lai ylai at users.sourceforge.net
Tue Jun 9 19:55:55 CEST 2009


Hi Zongyi,

Sorry for the several months delay in noticing this (since there has 
been so little comments on the HRTF filter, I simply wasn't looking 
carefully through the list).

I have several comments on this (which probably applies to other FIR 
operations in libaf, too):

1. The efficient way to implement naive (i.e. direct scalar product) FIR 
filtering is not to rely on movups/lddqu/..., but to replicate the 
coefficient for different alignments. This is also stated in one of the 
(more obscure) Intel SSE application notes.

I have a proof-of-concept implementation of this for af_hrtf, but 
because of (2) below, I wasn't satisfied with its performance. However, 
if there is interest, I can put together a patch and submit it to this list.

2. Direct/naive FIR is probably not the way to go for long FIR filters, 
but rather Karatsuba or FFT-based. Karatsuba method is faster than FFT 
for intermediate lengths (~ 512) and reasonably easy to implement 
without dependency on a large library (like FFTW). I am working on a 
Karatsuba method (with O(N^(log(3)/log(2))) complexity) for the af_hrtf 
filter.

Unfortunately, the Karatsuba method is obscure enough that little 
reference code for non-power-of-2 and nonequal kernel/signal lengths is 
out there, and e.g. GStreamer only implements it for power-of-2, equal 
length convolution, making it not suitable for af_hrtf.

Maybe I should also mention that I am currently also working on a 
revision of the HRTF that takes advantage of current processor 
performance in simulating room acoustics, thus the interest in efficient 
long filter length FIR filtering.

Best,

Yue Shi Lai

Zhou Zongyi schrieb:
> Hi all, 
> 
> This patch implements SIMD optimized FIR filter used by hrtf. They can also be used by other audio filters.
> 
> Since SSE3 is used, you should apply SSE3 detection patch before applying this one: 
> 
> http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/2009-January/059824.html
> 
> 
> Index: libaf/af_hrtf.c
> ===================================================================
> --- libaf/af_hrtf.c (revision 28934)
> +++ libaf/af_hrtf.c (working copy)
> @@ -27,6 +27,7 @@
>  
>  #include <math.h>
>  
> +#include "cpudetect.h"
>  #include "af.h"
>  #include "dsp.h"
>  
> @@ -77,19 +78,40 @@
>   *    sk: convolution kernel
>   *    offset: offset on the ring buffer, can be 
>   */
> -static float conv(const int nx, const int nk, const float *sx, const float *sk,
> -    const int offset)
> -{
> -    /* k = reminder of offset / nx */
> -    int k = offset >= 0 ? offset % nx : nx + (offset % nx);
>  
> -    if(nk + k <= nx)
> - return af_filter_fir(nk, sx + k, sk);
> -    else
> - return af_filter_fir(nk + k - nx, sx, sk + nx - k) +
> -     af_filter_fir(nx - k, sx + k, sk);
> +#define conv_routine(x)\
> +static float x(const int nx, const int nk, const float *sx, const float *sk, const int offset)\
> +{\
> +    /* k = reminder of offset / nx */\
> +    int k = offset >= 0 ? offset % nx : nx + (offset % nx);\
> +    if(nk + k <= nx)\
> + return af_filter_fir(nk, sx + k, sk);\
> +    else\
> + return af_filter_fir(nk + k - nx, sx, sk + nx - k) +\
> +     af_filter_fir(nx - k, sx + k, sk);\
>  }
>  
> +
> +#if HAVE_MMX
> +conv_routine(conv_c)
> +
> +#define af_filter_fir af_filter_fir_sse
> +conv_routine(conv_sse)
> +#undef af_filter_fir
> +
> +#define af_filter_fir af_filter_fir_sse3
> +conv_routine(conv_sse3)
> +#undef af_filter_fir
> +
> +#define af_filter_fir af_filter_fir_3dnow
> +conv_routine(conv_3dnow)
> +#undef af_filter_fir
> +
> +static float (*conv)(const int, const int, const float*, const float*, const int) = NULL;
> +#else
> +conv_routine(conv)
> +#endif
> +
>  /* Detect when the impulse response starts (significantly) */
>  static int pulse_detect(const float *sx)
>  {
> @@ -107,14 +129,8 @@
>  
>  /* Fuzzy matrix coefficient transfer function to "lock" the matrix on
>     a effectively passive mode if the gain is approximately 1 */
> -static inline float passive_lock(float x)
> -{
> -   const float x1 = x - 1;
> -   const float ax1s = fabs(x - 1) * (1.0 / MATAGCLOCK);
> +#define passive_lock(x) ((float)((x-1)-(x-1)/(1+(x-1)*(x-1)*(1/MATAGCLOCK/MATAGCLOCK))+1))
>  
> -   return x1 - x1 / (1 + ax1s * ax1s) + 1;
> -}
> -
>  /* Unified active matrix decoder for 2 channel matrix encoded surround
>     sources */
>  static inline void matrix_decode(short *in, const int k, const int il,
> @@ -668,6 +684,15 @@
>      for(i = 0; i < s->basslen; i++)
>   s->ba_ir[i] *= BASSGAIN;
>      
> +#if HAVE_MMX
> +    if (!conv){
> +     conv =  gCpuCaps.hasSSE3 ? conv_sse3 
> +           :(gCpuCaps.hasSSE  ? conv_sse
> +           :(gCpuCaps.has3DNow? conv_3dnow
> +                              : conv_c));
> +     af_msg(AF_MSG_INFO,"[hrtf] Using %s optimized converter\n",gCpuCaps.hasSSE3?"SSE3":(gCpuCaps.hasSSE?"SSE":(gCpuCaps.has3DNow? "3DNow!":"non")));
> +    }
> +#endif
>      return AF_OK;
>  }
>  
> Index: libaf/af_hrtf.h
> ===================================================================
> --- libaf/af_hrtf.h (revision 28934)
> +++ libaf/af_hrtf.h (working copy)
> @@ -106,7 +106,7 @@
>  
>  /* Center front (-5 degree) - not 0 degree in order to create a clear
>     front image from a finite distance */
> -static const float cf_filt[128] = {
> +static const float cf_filt[128] __attribute__((aligned(16))) = {
>     -0.00008638082319075036, 0.0003198059946385229,
>     -0.0005010631339162132, 0.0011424741331126876,
>     -0.001584220794688753, 0.001742715363246275,
> @@ -173,7 +173,7 @@
>     -0.005469203016468759, -0.004355784273189485
>  };
>  /* Front,   same side (30 degree) */
> -static const float af_filt[128] = {
> +static const float af_filt[128] __attribute__((aligned(16))) = {
>     -0.004140580614755493, 0.005790934614385445,
>     0.003318916682081112, 0.014257145544366063,
>     0.007328442487127339, -0.06550381777876194,
> @@ -240,7 +240,7 @@
>     -0.0005083025643192044, 0.00035096963769606926
>  };
>  /* Front,   opposite (-30 degree) */
> -static const float of_filt[128] = {
> +static const float of_filt[128] __attribute__((aligned(16))) = {
>     -0.000013472538374193126, -0.00008048061877079751,
>     0.000043927265781258155, -0.000017931700794858892,
>     -0.000034774602476112886, -0.00009576223008735474,
> @@ -307,7 +307,7 @@
>     -0.0013726264946164232, -0.0020075119315034313
>  };
>  /* Rear,   same side (135 degree) */
> -static const float ar_filt[128] = {
> +static const float ar_filt[128] __attribute__((aligned(16))) = {
>     0.004573315040810066, 0.0013592578059426913,
>     0.01553271930902704, -0.0002356117224941317,
>     -0.05746098219774702, 0.03430688963370445,
> @@ -374,7 +374,7 @@
>     -0.0026884314856593368, -0.004084181815125924
>  };
>  /* Rear,   opposite (-135 degree) */
> -static const float or_filt[128] = {
> +static const float or_filt[128] __attribute__((aligned(16))) = {
>     0.0001220944028243897, -0.000021785381808441314,
>     5.823057988603169e-6, -0.00001217768176447613,
>     -0.00006123604397345513, 5.574117262531134e-6,
> @@ -441,7 +441,7 @@
>     -0.001852908777923165, -0.002540339553144362
>  };
>  /* Center rear (180 degree) */
> -static const float cr_filt[128] = {
> +static const float cr_filt[128] __attribute__((aligned(16))) = {
>     -0.00005989110716536726, -0.00022790291829128702,
>     0.0002659166098971966, -0.0003774772716776257,
>     0.0004540309551867803, -0.000420238187386368,
> Index: libaf/filter.c
> ===================================================================
> --- libaf/filter.c (revision 28934)
> +++ libaf/filter.c (working copy)
> @@ -22,6 +22,8 @@
>  
>  #include <string.h>
>  #include <math.h>
> +#include "config.h"
> +#include "cpudetect.h"
>  #include "dsp.h"
>  
>  /******************************************************************************
> @@ -39,13 +41,141 @@
>  {
>    register FLOAT_TYPE y; // Output
>    y = 0.0; 
> -  do{
> -    n--;
> -    y+=w[n]*x[n];
> -  }while(n != 0);
> +  while((int)(n-=4)>=0){
> +    y+=w[n]*x[n]+(w+1)[n]*(x+1)[n]+(w+2)[n]*(x+2)[n]+(w+3)[n]*(x+3)[n];
> +  }
> +  switch(n)
> +  {
> +   case -1:y+=w[2]*x[2];
> +   case -2:y+=w[1]*x[1];
> +   case -3:y+=w[0]*x[0];
> +  }
>    return y;
>  }
>  
> +#if HAVE_MMX
> +/* SIMD implementation of FIR filter on x86 */
> +inline FLOAT_TYPE af_filter_fir_sse(register unsigned int n, const FLOAT_TYPE* w,
> +                                    const FLOAT_TYPE* x)
> +{
> +  FLOAT_TYPE y = 0.0; // Output
> +  __asm__(
> +      "subl $8, %%"REG_c" \n\t"
> +      "jl 2f \n\t"
> +      "xorps %%xmm4, %%xmm4 \n\t"
> +      ".align (1<<4) \n\t"
> +      "1: \n\t"
> +      "movups   (%%"REG_a",%%"REG_c",4), %%xmm0 \n\t"
> +      "movups   (%%"REG_d",%%"REG_c",4), %%xmm1 \n\t"
> +      "movups 16(%%"REG_a",%%"REG_c",4), %%xmm2 \n\t"
> +      "movups 16(%%"REG_d",%%"REG_c",4), %%xmm3 \n\t"
> +      "mulps   %%xmm1, %%xmm0 \n\t"
> +      "mulps   %%xmm3, %%xmm2 \n\t"
> +      "addps   %%xmm2, %%xmm4 \n\t"
> +      "addps   %%xmm0, %%xmm4 \n\t"
> +      "subl $8, %%"REG_c" \n\t"
> +      "jge 1b \n\t"
> +      "movhlps %%xmm4, %%xmm0 \n\t"
> +      "addps   %%xmm0, %%xmm4 \n\t"
> +      "movaps  %%xmm4, %%xmm0 \n\t"
> +      "shufps   $0x01, %%xmm4, %%xmm4 \n\t"
> +      "addps   %%xmm0, %%xmm4 \n\t"
> +      "movss   %%xmm4, %4 \n\t"
> +      "2: \n\t"
> +      :"=c"(n)
> +      :"a"(w),"d"(x),"c"(n),"m"(y)
> +   :"memory"
> +  );
> +  switch(n)
> +  {
> +   case -1:y+=w[6]*x[6];
> +   case -2:y+=w[5]*x[5];
> +   case -3:y+=w[4]*x[4];
> +   case -4:y+=w[3]*x[3];
> +   case -5:y+=w[2]*x[2];
> +   case -6:y+=w[1]*x[1];
> +   case -7:y+=w[0]*x[0];
> +  }
> +  return y;
> +}
> +
> +inline FLOAT_TYPE af_filter_fir_sse3(register unsigned int n, const FLOAT_TYPE* w,
> +                                     const FLOAT_TYPE* x)
> +{
> +  register FLOAT_TYPE y = 0.0; // Output
> +  __asm__(
> +      "subl $8, %%"REG_c" \n\t"
> +      "jl 2f \n\t"
> +      "xorps  %%xmm4, %%xmm4 \n\t"
> +      ".align (1<<4) \n\t"
> +      "1: \n\t"
> +      "lddqu   (%%"REG_a",%%"REG_c",4), %%xmm0 \n\t"
> +      "lddqu   (%%"REG_d",%%"REG_c",4), %%xmm1 \n\t"
> +      "lddqu 16(%%"REG_a",%%"REG_c",4), %%xmm2 \n\t"
> +      "lddqu 16(%%"REG_d",%%"REG_c",4), %%xmm3 \n\t"
> +      "mulps  %%xmm1, %%xmm0 \n\t"
> +      "mulps  %%xmm3, %%xmm2 \n\t"
> +      "addps  %%xmm2, %%xmm4 \n\t"
> +      "addps  %%xmm0, %%xmm4 \n\t"
> +      "subl       $8, %%"REG_c" \n\t"
> +      "jge 1b \n\t"
> +      "haddps %%xmm4, %%xmm4 \n\t"
> +      "haddps %%xmm4, %%xmm4 \n\t"
> +      "movd   %%xmm4, %0 \n\t"
> +      "2: \n\t"
> +      :"=r"(y),"=c"(n)
> +      :"a"(w),"d"(x),"c"(n)
> +  );
> +  switch(n)
> +  {
> +   case -1:y+=w[6]*x[6];
> +   case -2:y+=w[5]*x[5];
> +   case -3:y+=w[4]*x[4];
> +   case -4:y+=w[3]*x[3];
> +   case -5:y+=w[2]*x[2];
> +   case -6:y+=w[1]*x[1];
> +   case -7:y+=w[0]*x[0];
> +  }
> +  return y;
> +}
> +
> +inline FLOAT_TYPE af_filter_fir_3dnow(register unsigned int n, const FLOAT_TYPE* w,
> +                                      const FLOAT_TYPE* x)
> +{
> +  register FLOAT_TYPE y = 0.0; // Output
> +  __asm__(
> +      "subl $4, %%"REG_c" \n\t"
> +      "jl 2f \n\t"
> +      "pxor  %%mm4, %%mm4 \n\t"
> +      ".align (1<<4) \n\t"
> +      "1: \n\t"
> +      "movq  (%%"REG_a",%%"REG_c",4), %%mm0 \n\t"
> +      "movq  (%%"REG_d",%%"REG_c",4), %%mm1 \n\t"
> +      "movq 8(%%"REG_a",%%"REG_c",4), %%mm2 \n\t"
> +      "movq 8(%%"REG_d",%%"REG_c",4), %%mm3 \n\t"
> +      "pfmul %%mm1, %%mm0 \n\t"
> +      "pfmul %%mm3, %%mm2 \n\t"
> +      "pfadd %%mm2, %%mm4 \n\t"
> +      "pfadd %%mm0, %%mm4 \n\t"
> +      "subl     $4, %%"REG_c" \n\t"
> +      "jge 1b \n\t"
> +      "pfacc %%mm4, %%mm4 \n\t"
> +      "movd  %%mm4, %0 \n\t"
> +      "femms \n\t"
> +      "2: \n\t"
> +      :"=r"(y),"=c"(n)
> +      :"a"(w),"d"(x),"c"(n)
> +  );
> +  switch(n)
> +  {
> +   case -1:y+=w[2]*x[2];
> +   case -2:y+=w[1]*x[1];
> +   case -3:y+=w[0]*x[0];
> +  }
> +  return y;
> +}
> +#endif
> +
>  /* C implementation of parallel FIR filter y(k)=w(k) * x(k) (where * denotes convolution)
>  
>     n  number of filter taps, where mod(n,4)==0
> Index: libaf/filter.h
> ===================================================================
> --- libaf/filter.h (revision 28934)
> +++ libaf/filter.h (working copy)
> @@ -56,6 +56,12 @@
>  // Exported functions
>  FLOAT_TYPE af_filter_fir(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
>  
> +#if HAVE_MMX
> +FLOAT_TYPE af_filter_fir_sse(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
> +FLOAT_TYPE af_filter_fir_sse3(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
> +FLOAT_TYPE af_filter_fir_3dnow(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
> +#endif
> +
>  FLOAT_TYPE* af_filter_pfir(unsigned int n, unsigned int k,
>                             unsigned int xi, const FLOAT_TYPE** w,
>                             const FLOAT_TYPE** x, FLOAT_TYPE* y,
> 
> 
> 
> Regards,
> 
> Zhou Zongyi
> _______________________________________________
> MPlayer-dev-eng mailing list
> MPlayer-dev-eng at mplayerhq.hu
> https://lists.mplayerhq.hu/mailman/listinfo/mplayer-dev-eng




More information about the MPlayer-dev-eng mailing list