[MPlayer-dev-eng] [PATCH]: af_hrtf SIMD optimization (3DNOW!/SSE/SSE3/plain C)

Thu Mar 12 07:35:58 CET 2009

Hi all, 

This patch implements SIMD optimized FIR filter used by hrtf. They can also be used by other audio filters.

Since SSE3 is used, you should apply SSE3 detection patch before applying this one: 

http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/2009-January/059824.html


Index: libaf/af_hrtf.c
===================================================================

--- libaf/af_hrtf.c (revision 28934)
+++ libaf/af_hrtf.c (working copy)
@@ -27,6 +27,7 @@
 
 #include <math.h>
 
+#include "cpudetect.h"
 #include "af.h"
 #include "dsp.h"
 
@@ -77,19 +78,40 @@
  *    sk: convolution kernel
  *    offset: offset on the ring buffer, can be 
  */
-static float conv(const int nx, const int nk, const float *sx, const float *sk,
-    const int offset)
-{
-    /* k = reminder of offset / nx */
-    int k = offset >= 0 ? offset % nx : nx + (offset % nx);
 
-    if(nk + k <= nx)
- return af_filter_fir(nk, sx + k, sk);
-    else
- return af_filter_fir(nk + k - nx, sx, sk + nx - k) +
-     af_filter_fir(nx - k, sx + k, sk);
+#define conv_routine(x)\
+static float x(const int nx, const int nk, const float *sx, const float *sk, const int offset)\
+{\
+    /* k = reminder of offset / nx */\
+    int k = offset >= 0 ? offset % nx : nx + (offset % nx);\
+    if(nk + k <= nx)\
+ return af_filter_fir(nk, sx + k, sk);\
+    else\
+ return af_filter_fir(nk + k - nx, sx, sk + nx - k) +\
+     af_filter_fir(nx - k, sx + k, sk);\
 }
 
+
+#if HAVE_MMX
+conv_routine(conv_c)
+
+#define af_filter_fir af_filter_fir_sse
+conv_routine(conv_sse)
+#undef af_filter_fir
+
+#define af_filter_fir af_filter_fir_sse3
+conv_routine(conv_sse3)
+#undef af_filter_fir
+
+#define af_filter_fir af_filter_fir_3dnow
+conv_routine(conv_3dnow)
+#undef af_filter_fir
+
+static float (*conv)(const int, const int, const float*, const float*, const int) = NULL;
+#else
+conv_routine(conv)
+#endif
+
 /* Detect when the impulse response starts (significantly) */
 static int pulse_detect(const float *sx)
 {
@@ -107,14 +129,8 @@
 
 /* Fuzzy matrix coefficient transfer function to "lock" the matrix on
    a effectively passive mode if the gain is approximately 1 */
-static inline float passive_lock(float x)
-{
-   const float x1 = x - 1;
-   const float ax1s = fabs(x - 1) * (1.0 / MATAGCLOCK);
+#define passive_lock(x) ((float)((x-1)-(x-1)/(1+(x-1)*(x-1)*(1/MATAGCLOCK/MATAGCLOCK))+1))
 
-   return x1 - x1 / (1 + ax1s * ax1s) + 1;
-}
-
 /* Unified active matrix decoder for 2 channel matrix encoded surround
    sources */
 static inline void matrix_decode(short *in, const int k, const int il,
@@ -668,6 +684,15 @@
     for(i = 0; i < s->basslen; i++)
  s->ba_ir[i] *= BASSGAIN;
     
+#if HAVE_MMX
+    if (!conv){
+     conv =  gCpuCaps.hasSSE3 ? conv_sse3 
+           :(gCpuCaps.hasSSE  ? conv_sse
+           :(gCpuCaps.has3DNow? conv_3dnow
+                              : conv_c));
+     af_msg(AF_MSG_INFO,"[hrtf] Using %s optimized converter\n",gCpuCaps.hasSSE3?"SSE3":(gCpuCaps.hasSSE?"SSE":(gCpuCaps.has3DNow? "3DNow!":"non")));
+    }
+#endif
     return AF_OK;
 }
 
Index: libaf/af_hrtf.h
===================================================================
--- libaf/af_hrtf.h (revision 28934)
+++ libaf/af_hrtf.h (working copy)
@@ -106,7 +106,7 @@
 
 /* Center front (-5 degree) - not 0 degree in order to create a clear
    front image from a finite distance */
-static const float cf_filt[128] = {
+static const float cf_filt[128] __attribute__((aligned(16))) = {
    -0.00008638082319075036, 0.0003198059946385229,
    -0.0005010631339162132, 0.0011424741331126876,
    -0.001584220794688753, 0.001742715363246275,
@@ -173,7 +173,7 @@
    -0.005469203016468759, -0.004355784273189485
 };
 /* Front,   same side (30 degree) */
-static const float af_filt[128] = {
+static const float af_filt[128] __attribute__((aligned(16))) = {
    -0.004140580614755493, 0.005790934614385445,
    0.003318916682081112, 0.014257145544366063,
    0.007328442487127339, -0.06550381777876194,
@@ -240,7 +240,7 @@
    -0.0005083025643192044, 0.00035096963769606926
 };
 /* Front,   opposite (-30 degree) */
-static const float of_filt[128] = {
+static const float of_filt[128] __attribute__((aligned(16))) = {
    -0.000013472538374193126, -0.00008048061877079751,
    0.000043927265781258155, -0.000017931700794858892,
    -0.000034774602476112886, -0.00009576223008735474,
@@ -307,7 +307,7 @@
    -0.0013726264946164232, -0.0020075119315034313
 };
 /* Rear,   same side (135 degree) */
-static const float ar_filt[128] = {
+static const float ar_filt[128] __attribute__((aligned(16))) = {
    0.004573315040810066, 0.0013592578059426913,
    0.01553271930902704, -0.0002356117224941317,
    -0.05746098219774702, 0.03430688963370445,
@@ -374,7 +374,7 @@
    -0.0026884314856593368, -0.004084181815125924
 };
 /* Rear,   opposite (-135 degree) */
-static const float or_filt[128] = {
+static const float or_filt[128] __attribute__((aligned(16))) = {
    0.0001220944028243897, -0.000021785381808441314,
    5.823057988603169e-6, -0.00001217768176447613,
    -0.00006123604397345513, 5.574117262531134e-6,
@@ -441,7 +441,7 @@
    -0.001852908777923165, -0.002540339553144362
 };
 /* Center rear (180 degree) */
-static const float cr_filt[128] = {
+static const float cr_filt[128] __attribute__((aligned(16))) = {
    -0.00005989110716536726, -0.00022790291829128702,
    0.0002659166098971966, -0.0003774772716776257,
    0.0004540309551867803, -0.000420238187386368,
Index: libaf/filter.c
===================================================================
--- libaf/filter.c (revision 28934)
+++ libaf/filter.c (working copy)
@@ -22,6 +22,8 @@
 
 #include <string.h>
 #include <math.h>
+#include "config.h"
+#include "cpudetect.h"
 #include "dsp.h"
 
 /******************************************************************************
@@ -39,13 +41,141 @@
 {
   register FLOAT_TYPE y; // Output
   y = 0.0; 
-  do{
-    n--;
-    y+=w[n]*x[n];
-  }while(n != 0);
+  while((int)(n-=4)>=0){
+    y+=w[n]*x[n]+(w+1)[n]*(x+1)[n]+(w+2)[n]*(x+2)[n]+(w+3)[n]*(x+3)[n];
+  }
+  switch(n)
+  {
+   case -1:y+=w[2]*x[2];
+   case -2:y+=w[1]*x[1];
+   case -3:y+=w[0]*x[0];
+  }
   return y;
 }
 
+#if HAVE_MMX
+/* SIMD implementation of FIR filter on x86 */
+inline FLOAT_TYPE af_filter_fir_sse(register unsigned int n, const FLOAT_TYPE* w,
+                                    const FLOAT_TYPE* x)
+{
+  FLOAT_TYPE y = 0.0; // Output
+  __asm__(
+      "subl $8, %%"REG_c" \n\t"
+      "jl 2f \n\t"
+      "xorps %%xmm4, %%xmm4 \n\t"
+      ".align (1<<4) \n\t"
+      "1: \n\t"
+      "movups   (%%"REG_a",%%"REG_c",4), %%xmm0 \n\t"
+      "movups   (%%"REG_d",%%"REG_c",4), %%xmm1 \n\t"
+      "movups 16(%%"REG_a",%%"REG_c",4), %%xmm2 \n\t"
+      "movups 16(%%"REG_d",%%"REG_c",4), %%xmm3 \n\t"
+      "mulps   %%xmm1, %%xmm0 \n\t"
+      "mulps   %%xmm3, %%xmm2 \n\t"
+      "addps   %%xmm2, %%xmm4 \n\t"
+      "addps   %%xmm0, %%xmm4 \n\t"
+      "subl $8, %%"REG_c" \n\t"
+      "jge 1b \n\t"
+      "movhlps %%xmm4, %%xmm0 \n\t"
+      "addps   %%xmm0, %%xmm4 \n\t"
+      "movaps  %%xmm4, %%xmm0 \n\t"
+      "shufps   $0x01, %%xmm4, %%xmm4 \n\t"
+      "addps   %%xmm0, %%xmm4 \n\t"
+      "movss   %%xmm4, %4 \n\t"
+      "2: \n\t"
+      :"=c"(n)
+      :"a"(w),"d"(x),"c"(n),"m"(y)
+   :"memory"
+  );
+  switch(n)
+  {
+   case -1:y+=w[6]*x[6];
+   case -2:y+=w[5]*x[5];
+   case -3:y+=w[4]*x[4];
+   case -4:y+=w[3]*x[3];
+   case -5:y+=w[2]*x[2];
+   case -6:y+=w[1]*x[1];
+   case -7:y+=w[0]*x[0];
+  }
+  return y;
+}
+
+inline FLOAT_TYPE af_filter_fir_sse3(register unsigned int n, const FLOAT_TYPE* w,
+                                     const FLOAT_TYPE* x)
+{
+  register FLOAT_TYPE y = 0.0; // Output
+  __asm__(
+      "subl $8, %%"REG_c" \n\t"
+      "jl 2f \n\t"
+      "xorps  %%xmm4, %%xmm4 \n\t"
+      ".align (1<<4) \n\t"
+      "1: \n\t"
+      "lddqu   (%%"REG_a",%%"REG_c",4), %%xmm0 \n\t"
+      "lddqu   (%%"REG_d",%%"REG_c",4), %%xmm1 \n\t"
+      "lddqu 16(%%"REG_a",%%"REG_c",4), %%xmm2 \n\t"
+      "lddqu 16(%%"REG_d",%%"REG_c",4), %%xmm3 \n\t"
+      "mulps  %%xmm1, %%xmm0 \n\t"
+      "mulps  %%xmm3, %%xmm2 \n\t"
+      "addps  %%xmm2, %%xmm4 \n\t"
+      "addps  %%xmm0, %%xmm4 \n\t"
+      "subl       $8, %%"REG_c" \n\t"
+      "jge 1b \n\t"
+      "haddps %%xmm4, %%xmm4 \n\t"
+      "haddps %%xmm4, %%xmm4 \n\t"
+      "movd   %%xmm4, %0 \n\t"
+      "2: \n\t"
+      :"=r"(y),"=c"(n)
+      :"a"(w),"d"(x),"c"(n)
+  );
+  switch(n)
+  {
+   case -1:y+=w[6]*x[6];
+   case -2:y+=w[5]*x[5];
+   case -3:y+=w[4]*x[4];
+   case -4:y+=w[3]*x[3];
+   case -5:y+=w[2]*x[2];
+   case -6:y+=w[1]*x[1];
+   case -7:y+=w[0]*x[0];
+  }
+  return y;
+}
+
+inline FLOAT_TYPE af_filter_fir_3dnow(register unsigned int n, const FLOAT_TYPE* w,
+                                      const FLOAT_TYPE* x)
+{
+  register FLOAT_TYPE y = 0.0; // Output
+  __asm__(
+      "subl $4, %%"REG_c" \n\t"
+      "jl 2f \n\t"
+      "pxor  %%mm4, %%mm4 \n\t"
+      ".align (1<<4) \n\t"
+      "1: \n\t"
+      "movq  (%%"REG_a",%%"REG_c",4), %%mm0 \n\t"
+      "movq  (%%"REG_d",%%"REG_c",4), %%mm1 \n\t"
+      "movq 8(%%"REG_a",%%"REG_c",4), %%mm2 \n\t"
+      "movq 8(%%"REG_d",%%"REG_c",4), %%mm3 \n\t"
+      "pfmul %%mm1, %%mm0 \n\t"
+      "pfmul %%mm3, %%mm2 \n\t"
+      "pfadd %%mm2, %%mm4 \n\t"
+      "pfadd %%mm0, %%mm4 \n\t"
+      "subl     $4, %%"REG_c" \n\t"
+      "jge 1b \n\t"
+      "pfacc %%mm4, %%mm4 \n\t"
+      "movd  %%mm4, %0 \n\t"
+      "femms \n\t"
+      "2: \n\t"
+      :"=r"(y),"=c"(n)
+      :"a"(w),"d"(x),"c"(n)
+  );
+  switch(n)
+  {
+   case -1:y+=w[2]*x[2];
+   case -2:y+=w[1]*x[1];
+   case -3:y+=w[0]*x[0];
+  }
+  return y;
+}
+#endif
+
 /* C implementation of parallel FIR filter y(k)=w(k) * x(k) (where * denotes convolution)
 
    n  number of filter taps, where mod(n,4)==0
Index: libaf/filter.h
===================================================================
--- libaf/filter.h (revision 28934)
+++ libaf/filter.h (working copy)
@@ -56,6 +56,12 @@
 // Exported functions
 FLOAT_TYPE af_filter_fir(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
 
+#if HAVE_MMX
+FLOAT_TYPE af_filter_fir_sse(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
+FLOAT_TYPE af_filter_fir_sse3(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
+FLOAT_TYPE af_filter_fir_3dnow(unsigned int n, const FLOAT_TYPE* w, const FLOAT_TYPE* x);
+#endif
+
 FLOAT_TYPE* af_filter_pfir(unsigned int n, unsigned int k,
                            unsigned int xi, const FLOAT_TYPE** w,
                            const FLOAT_TYPE** x, FLOAT_TYPE* y,



Regards,

Zhou Zongyi