[MPlayer-dev-eng] [PATCH]: af_resample SIMD optimization (MMX/SSE2/3DNOW!/SSE/SSE3)

Zhou Zongyi zhouzongyi at pset.suntec.net
Thu Mar 12 07:21:44 CET 2009


Hi all, 

This patch adds SIMD-optimized implementations of the FIR filter used by the resample audio filter (af_resample).

Because the patch uses SSE3, you need to apply the SSE3 detection patch before applying this one:

http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/2009-January/059824.html


Index: libaf/af_resample.c 
=================================================================== 
--- libaf/af_resample.c (revision 28929) 
+++ libaf/af_resample.c (working copy) 
@@ -24,6 +24,8 @@ 
 #include <stdlib.h>
 #include <inttypes.h>
  
+#include "config.h" 
+#include "cpudetect.h" 
 #include "libavutil/common.h" 
 #include "libavutil/mathematics.h" 
 #include "af.h" 
@@ -51,6 +53,16 @@ 
 #define RSMP_FLOAT (2<<0) // 32 bit floating point 
 #define RSMP_MASK (3<<0) 
  
+#if HAVE_MMX
+#define RSMP_MMX     (1<<3) // integer FIR via MMX (picked when SSE2 is not reported)
+#define RSMP_SSE2    (2<<3) // integer FIR via SSE2
+//#define RSMP_SSSE3   (3<<3) // value 3 left unused: this patch has no SSSE3 path
+#define RSMP_SSE     (4<<3) // float FIR via SSE
+#define RSMP_SSE3    (5<<3) // float FIR via SSE3
+#define RSMP_3DNOW   (6<<3) // float FIR via 3DNow!
+#define RSMP_CPUCAPS (7<<3) // mask covering all of the CPU-path values above
+#endif
+ 
 // Defines for sloppy or exact resampling 
 #define FREQ_SLOPPY  (0<<2) 
 #define FREQ_EXACT   (1<<2) 
@@ -166,6 +178,16 @@ 
   data->format = af->data->format; 
   data->bps = af->data->bps; 
   af->data->nch = data->nch; 
+#if HAVE_MMX 
+  if ((s->setup & RSMP_INT) && !(s->setup & RSMP_CPUCAPS)){ 
+   s->setup |= (gCpuCaps.hasSSE2?RSMP_SSE2:RSMP_MMX); 
+    af_msg(AF_MSG_INFO,"[resample] Using %s optimized resampler\n",gCpuCaps.hasSSE2?"SSE2":"MMX"); 
+  } 
+  else if ((s->setup & RSMP_FLOAT) && !(s->setup & RSMP_CPUCAPS)){ 
+   s->setup |= (gCpuCaps.hasSSE3?RSMP_SSE3:(gCpuCaps.hasSSE?RSMP_SSE:(gCpuCaps.has3DNow?RSMP_3DNOW:0))); 
+    af_msg(AF_MSG_INFO,"[resample] Using %s optimized resampler\n",gCpuCaps.hasSSE3?"SSE3":(gCpuCaps.hasSSE?"SSE":(gCpuCaps.has3DNow?"3DNow!":"non"))); 
+  } 
+#endif 
   return rv; 
 } 
  
@@ -329,9 +351,9 @@ 
     return NULL; 
  
   // Run resampling 
+#if !(HAVE_MMX) 
   switch(s->setup & RSMP_MASK){ 
   case(RSMP_INT): 
-# define FORMAT_I 1 
     if(s->up>s->dn){ 
 #     define UP 
 #     include "af_resample_template.c" 
@@ -357,6 +379,94 @@ 
 #     undef DN 
     } 
     break; 
+#else 
+  switch(s->setup & (RSMP_MASK | RSMP_CPUCAPS)){ 
+# define FORMAT_I 1 
+  case(RSMP_INT | RSMP_MMX): 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+    __asm__ volatile("emms \n\t"); 
+    break; 
+  case(RSMP_INT | RSMP_SSE2): 
+#   define FIRSSE2 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+#   undef FIRSSE2 
+    break; 
+# undef FORMAT_I 
+# define FORMAT_F 1 
+  case(RSMP_FLOAT | RSMP_SSE3): 
+#   define FIRSSE3 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+#   undef FIRSSE3 
+    break; 
+  case(RSMP_FLOAT | RSMP_SSE): 
+#   define FIRSSE 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+#   undef FIRSSE 
+    break; 
+  case(RSMP_FLOAT | RSMP_3DNOW): 
+#   define FIR3DNOW 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+    __asm__ volatile("emms \n\t"); 
+#   undef FIR3DNOW 
+    break; 
+  case(RSMP_FLOAT): 
+    if(s->up>s->dn){ 
+#     define UP 
+#     include "af_resample_template.c" 
+#     undef UP 
+    } 
+    else{ 
+#     define DN 
+#     include "af_resample_template.c" 
+#     undef DN 
+    } 
+    break; 
+#endif 
   case(RSMP_LIN): 
     len = linint(c, l, s); 
     break; 
Index: libaf/af_resample_template.c 
=================================================================== 
--- libaf/af_resample_template.c (revision 28929) 
+++ libaf/af_resample_template.c (working copy) 
@@ -65,13 +65,157 @@ 
 #else  /* L8/L16 */ 
  
 #define L    16 
+#ifdef FORMAT_I 
+#ifdef FIRSSE2 
+#define FIR(x,w,y) /* SSE2: y[0] = (sum of 16 signed 16-bit products w[i]*x[i]) >> 16 */ \
+   __asm__(\
+       "movups   (%%"REG_d"), %%xmm0 \n\t" /* w[0..7] (unaligned load) */\
+       "movups   (%%"REG_c"), %%xmm1 \n\t" /* x[0..7] */\
+       "movups 16(%%"REG_d"), %%xmm2 \n\t" /* w[8..15] */\
+       "movups 16(%%"REG_c"), %%xmm3 \n\t" /* x[8..15] */\
+       "pmaddwd   %%xmm1, %%xmm0 \n\t" /* 4 dwords, each a sum of two products */\
+       "pmaddwd   %%xmm3, %%xmm2 \n\t"\
+       "paddd     %%xmm2, %%xmm0 \n\t"\
+       "movhlps   %%xmm0, %%xmm1 \n\t" /* fold the upper 64 bits onto the lower */\
+       "paddd     %%xmm1, %%xmm0 \n\t"\
+       "pshufd     $0x01, %%xmm0, %%xmm1 \n\t" /* fold dword 1 onto dword 0 */\
+       "paddd     %%xmm1, %%xmm0 \n\t"\
+       "movd      %%xmm0, %%ebx \n\t"\
+       "shrl         $16, %%ebx \n\t" /* keep bits 16..31 of the 32-bit sum */\
+       "movw        %%bx, (%%"REG_a") \n\t" /* store a single 16-bit result at y */\
+       :\
+       : "a"(y),"d"(w),"c"(x)\
+       : "%ebx", "memory" /* "memory": asm writes *y and reads x/w via pointers. NOTE(review): %ebx is the PIC register on 32-bit x86 */\
+   )
+#else 
+#define FIR(x,w,y) /* MMX: y[0] = (sum of 16 signed 16-bit products w[i]*x[i]) >> 16; leaves MMX state live, the includer issues emms */ \
+   __asm__(\
+       "movq   (%%"REG_d"), %%mm0 \n\t" /* w[0..3] */\
+       "movq   (%%"REG_c"), %%mm1 \n\t" /* x[0..3] */\
+       "movq  8(%%"REG_d"), %%mm2 \n\t" /* w[4..7] */\
+       "movq  8(%%"REG_c"), %%mm3 \n\t" /* x[4..7] */\
+       "pmaddwd   %%mm1, %%mm0 \n\t" /* 2 dwords, each a sum of two products */\
+       "pmaddwd   %%mm3, %%mm2 \n\t"\
+       "movq 16(%%"REG_d"), %%mm4 \n\t" /* w[8..11] */\
+       "movq 16(%%"REG_c"), %%mm5 \n\t" /* x[8..11] */\
+       "movq 24(%%"REG_d"), %%mm6 \n\t" /* w[12..15] */\
+       "movq 24(%%"REG_c"), %%mm7 \n\t" /* x[12..15] */\
+       "pmaddwd   %%mm5, %%mm4 \n\t"\
+       "pmaddwd   %%mm7, %%mm6 \n\t"\
+       "paddd     %%mm2, %%mm0 \n\t"\
+       "paddd     %%mm6, %%mm4 \n\t"\
+       "paddd     %%mm4, %%mm0 \n\t" /* mm0 = two partial sums */\
+       "movq      %%mm0, %%mm1 \n\t"\
+       "punpckhdq %%mm0, %%mm0 \n\t" /* duplicate the high dword into the low one */\
+       "paddd     %%mm1, %%mm0 \n\t" /* low dword = total */\
+       "movd      %%mm0, %%ebx \n\t"\
+       "shrl        $16, %%ebx \n\t" /* keep bits 16..31 of the 32-bit sum */\
+       "movw       %%bx, (%%"REG_a") \n\t" /* store a single 16-bit result at y */\
+       :\
+       : "a"(y),"d"(w),"c"(x)\
+       : "%ebx", "memory" /* "memory": asm writes *y and reads x/w via pointers. NOTE(review): %ebx is the PIC register on 32-bit x86 */\
+   )
+#endif // FIRSSE2 
+#else // FORMAT_I 
+#ifdef FIRSSE 
+#define FIR(x,w,y) /* SSE: y[0] = sum of the 16 float products w[i]*x[i]; lane-wise summation order differs from the scalar macro */ \
+   __asm__(\
+       "movups   (%%"REG_d"), %%xmm0 \n\t" /* w[0..3] (unaligned load) */\
+       "movups   (%%"REG_c"), %%xmm1 \n\t" /* x[0..3] */\
+       "movups 16(%%"REG_d"), %%xmm2 \n\t" /* w[4..7] */\
+       "movups 16(%%"REG_c"), %%xmm3 \n\t" /* x[4..7] */\
+       "mulps     %%xmm1, %%xmm0 \n\t"\
+       "mulps     %%xmm3, %%xmm2 \n\t"\
+       "movups 32(%%"REG_d"), %%xmm4 \n\t" /* w[8..11] */\
+       "movups 32(%%"REG_c"), %%xmm5 \n\t" /* x[8..11] */\
+       "addps     %%xmm2, %%xmm0 \n\t"\
+       "movups 48(%%"REG_d"), %%xmm6 \n\t" /* w[12..15] */\
+       "movups 48(%%"REG_c"), %%xmm7 \n\t" /* x[12..15] */\
+       "mulps     %%xmm5, %%xmm4 \n\t"\
+       "mulps     %%xmm7, %%xmm6 \n\t"\
+       "addps     %%xmm6, %%xmm4 \n\t"\
+       "addps     %%xmm4, %%xmm0 \n\t" /* xmm0 = four partial sums */\
+       "movhlps   %%xmm0, %%xmm1 \n\t" /* fold the upper two lanes onto the lower two */\
+       "addps     %%xmm1, %%xmm0 \n\t"\
+       "movaps    %%xmm0, %%xmm1 \n\t"\
+       "shufps     $0x01, %%xmm0, %%xmm0 \n\t" /* bring lane 1 down to lane 0 */\
+       "addps     %%xmm1, %%xmm0 \n\t" /* lane 0 = total */\
+       "movss     %%xmm0, (%%"REG_a") \n\t" /* store a single float at y */\
+       :\
+       : "a"(y),"d"(w),"c"(x) : "memory" /* "memory": asm writes *y and reads x/w via pointers */\
+   )
+#elif defined (FIRSSE3) 
+#define FIR(x,w,y) /* SSE3: y[0] = sum of the 16 float products w[i]*x[i]; lane-wise summation order differs from the scalar macro */ \
+   __asm__(\
+       "lddqu   (%%"REG_d"), %%xmm0 \n\t" /* w[0..3] (unaligned 128-bit load) */\
+       "lddqu   (%%"REG_c"), %%xmm1 \n\t" /* x[0..3] */\
+       "lddqu 16(%%"REG_d"), %%xmm2 \n\t" /* w[4..7] */\
+       "lddqu 16(%%"REG_c"), %%xmm3 \n\t" /* x[4..7] */\
+       "mulps    %%xmm1, %%xmm0 \n\t"\
+       "mulps    %%xmm3, %%xmm2 \n\t"\
+       "lddqu 32(%%"REG_d"), %%xmm4 \n\t" /* w[8..11] */\
+       "lddqu 32(%%"REG_c"), %%xmm5 \n\t" /* x[8..11] */\
+       "addps    %%xmm2, %%xmm0 \n\t"\
+       "lddqu 48(%%"REG_d"), %%xmm6 \n\t" /* w[12..15] */\
+       "lddqu 48(%%"REG_c"), %%xmm7 \n\t" /* x[12..15] */\
+       "mulps    %%xmm5, %%xmm4 \n\t"\
+       "mulps    %%xmm7, %%xmm6 \n\t"\
+       "addps    %%xmm6, %%xmm4 \n\t"\
+       "addps    %%xmm4, %%xmm0 \n\t" /* xmm0 = four partial sums */\
+       "haddps   %%xmm0, %%xmm0 \n\t" /* pairwise horizontal add: two sums left */\
+       "haddps   %%xmm0, %%xmm0 \n\t" /* second pass: lane 0 = total */\
+       "movss    %%xmm0, (%%"REG_a") \n\t" /* store a single float at y */\
+       :\
+       : "a"(y),"d"(w),"c"(x) : "memory" /* "memory": asm writes *y and reads x/w via pointers */\
+   )
+#elif defined(FIR3DNOW) 
+#define FIR(x,w,y) /* 3DNow!: y[0] = sum of the 16 float products w[i]*x[i]; leaves MMX state live, the includer issues emms */ \
+   __asm__(\
+       "movq   (%%"REG_d"), %%mm0 \n\t" /* w[0..1] */\
+       "movq   (%%"REG_c"), %%mm1 \n\t" /* x[0..1] */\
+       "movq  8(%%"REG_d"), %%mm2 \n\t" /* w[2..3] */\
+       "movq  8(%%"REG_c"), %%mm3 \n\t" /* x[2..3] */\
+       "pfmul   %%mm1, %%mm0 \n\t"\
+       "pfmul   %%mm3, %%mm2 \n\t"\
+       "movq 16(%%"REG_d"), %%mm4 \n\t" /* w[4..5] */\
+       "movq 16(%%"REG_c"), %%mm5 \n\t" /* x[4..5] */\
+       "pfadd   %%mm2, %%mm0 \n\t"\
+       "movq 24(%%"REG_d"), %%mm6 \n\t" /* w[6..7] */\
+       "movq 24(%%"REG_c"), %%mm7 \n\t" /* x[6..7] */\
+       "pfmul   %%mm5, %%mm4 \n\t"\
+       "pfmul   %%mm7, %%mm6 \n\t"\
+       "movq 32(%%"REG_d"), %%mm1 \n\t" /* w[8..9]  (mm1 reused) */\
+       "movq 32(%%"REG_c"), %%mm2 \n\t" /* x[8..9]  (mm2 reused) */\
+       "pfadd   %%mm6, %%mm4 \n\t"\
+       "pfmul   %%mm2, %%mm1 \n\t"\
+       "movq 40(%%"REG_d"), %%mm2 \n\t" /* w[10..11] */\
+       "movq 40(%%"REG_c"), %%mm3 \n\t" /* x[10..11] */\
+       "pfadd   %%mm4, %%mm0 \n\t"\
+       "pfmul   %%mm3, %%mm2 \n\t"\
+       "movq 48(%%"REG_d"), %%mm4 \n\t" /* w[12..13] */\
+       "movq 48(%%"REG_c"), %%mm5 \n\t" /* x[12..13] */\
+       "pfadd   %%mm2, %%mm1 \n\t"\
+       "pfmul   %%mm5, %%mm4 \n\t"\
+       "movq 56(%%"REG_d"), %%mm6 \n\t" /* w[14..15] */\
+       "movq 56(%%"REG_c"), %%mm7 \n\t" /* x[14..15] */\
+       "pfadd   %%mm1, %%mm0 \n\t"\
+       "pfmul   %%mm7, %%mm6 \n\t"\
+       "pfadd   %%mm6, %%mm4 \n\t"\
+       "pfadd   %%mm4, %%mm0 \n\t" /* mm0 = two partial sums */\
+       "pfacc   %%mm0, %%mm0 \n\t" /* add the two lanes: low lane = total */\
+       "movd    %%mm0, (%%"REG_a") \n\t" /* store a single float at y */\
+       :\
+       : "a"(y),"d"(w),"c"(x) : "memory" /* "memory": asm writes *y and reads x/w via pointers */\
+   )
+#else // FIRSSE 
 // Unrolled loop to speed up execution  
 #define FIR(x,w,y) \ 
   y[0] = ( w[0] *x[0] +w[1] *x[1] +w[2] *x[2] +w[3] *x[3] \ 
          + w[4] *x[4] +w[5] *x[5] +w[6] *x[6] +w[7] *x[7] \ 
          + w[8] *x[8] +w[9] *x[9] +w[10]*x[10]+w[11]*x[11] \ 
          + w[12]*x[12]+w[13]*x[13]+w[14]*x[14]+w[15]*x[15] ) SHIFT 
- 
+#endif // FIRSSE 
+#endif // FORMAT_I 
 #endif /* L8/L16 */ 
  
 // Macro to add data to circular que 



Regards,

Zhou Zongyi


More information about the MPlayer-dev-eng mailing list