[MPlayer-dev-eng] [PATCH]: af_resample SIMD optimization (MMX/SSE2/3DNOW!/SSE/SSE3)
Zhou Zongyi
zhouzongyi at pset.suntec.net
Thu Mar 12 07:21:44 CET 2009
Hi all,
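This patch implements SIMD-optimized FIR filters for af_resample: MMX and SSE2 versions for the integer path, and 3DNow!, SSE and SSE3 versions for the float path.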
Because SSE3 is used, you need to apply the SSE3 detection patch before applying this one:
http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/2009-January/059824.html
Index: libaf/af_resample.c
===================================================================
--- libaf/af_resample.c (revision 28929)
+++ libaf/af_resample.c (working copy)
@@ -24,6 +24,8 @@
#include
#include
+#include "config.h"
+#include "cpudetect.h"
#include "libavutil/common.h"
#include "libavutil/mathematics.h"
#include "af.h"
@@ -51,6 +53,16 @@
#define RSMP_FLOAT (2<<0) // 32 bit floating point
#define RSMP_MASK (3<<0)
+#if HAVE_MMX
+#define RSMP_MMX (1<<3)
+#define RSMP_SSE2 (2<<3)
+//#define RSMP_SSSE3 (3<<3)
+#define RSMP_SSE (4<<3)
+#define RSMP_SSE3 (5<<3)
+#define RSMP_3DNOW (6<<3)
+#define RSMP_CPUCAPS (7<<3)
+#endif
+
// Defines for sloppy or exact resampling
#define FREQ_SLOPPY (0<<2)
#define FREQ_EXACT (1<<2)
@@ -166,6 +178,16 @@
data->format = af->data->format;
data->bps = af->data->bps;
af->data->nch = data->nch;
+#if HAVE_MMX
+ if ((s->setup & RSMP_INT) && !(s->setup & RSMP_CPUCAPS)){
+ s->setup |= (gCpuCaps.hasSSE2?RSMP_SSE2:RSMP_MMX);
+ af_msg(AF_MSG_INFO,"[resample] Using %s optimized resampler\n",gCpuCaps.hasSSE2?"SSE2":"MMX");
+ }
+ else if ((s->setup & RSMP_FLOAT) && !(s->setup & RSMP_CPUCAPS)){
+ s->setup |= (gCpuCaps.hasSSE3?RSMP_SSE3:(gCpuCaps.hasSSE?RSMP_SSE:(gCpuCaps.has3DNow?RSMP_3DNOW:0)));
+ af_msg(AF_MSG_INFO,"[resample] Using %s optimized resampler\n",gCpuCaps.hasSSE3?"SSE3":(gCpuCaps.hasSSE?"SSE":(gCpuCaps.has3DNow?"3DNow!":"non")));
+ }
+#endif
return rv;
}
@@ -329,9 +351,9 @@
return NULL;
// Run resampling
+#if !(HAVE_MMX)
switch(s->setup & RSMP_MASK){
case(RSMP_INT):
-# define FORMAT_I 1
if(s->up>s->dn){
# define UP
# include "af_resample_template.c"
@@ -357,6 +379,94 @@
# undef DN
}
break;
+#else
+ switch(s->setup & (RSMP_MASK | RSMP_CPUCAPS)){
+# define FORMAT_I 1
+ case(RSMP_INT | RSMP_MMX):
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+ __asm__ volatile("emms \n\t");
+ break;
+ case(RSMP_INT | RSMP_SSE2):
+# define FIRSSE2
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+# undef FIRSSE2
+ break;
+# undef FORMAT_I
+# define FORMAT_F 1
+ case(RSMP_FLOAT | RSMP_SSE3):
+# define FIRSSE3
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+# undef FIRSSE3
+ break;
+ case(RSMP_FLOAT | RSMP_SSE):
+# define FIRSSE
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+# undef FIRSSE
+ break;
+ case(RSMP_FLOAT | RSMP_3DNOW):
+# define FIR3DNOW
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+ __asm__ volatile("emms \n\t");
+# undef FIR3DNOW
+ break;
+ case(RSMP_FLOAT):
+ if(s->up>s->dn){
+# define UP
+# include "af_resample_template.c"
+# undef UP
+ }
+ else{
+# define DN
+# include "af_resample_template.c"
+# undef DN
+ }
+ break;
+#endif
case(RSMP_LIN):
len = linint(c, l, s);
break;
Index: libaf/af_resample_template.c
===================================================================
--- libaf/af_resample_template.c (revision 28929)
+++ libaf/af_resample_template.c (working copy)
@@ -65,13 +65,157 @@
#else /* L8/L16 */
#define L 16
+#ifdef FORMAT_I
+#ifdef FIRSSE2
+#define FIR(x,w,y) \
+ __asm__(\
+ "movups (%%"REG_d"), %%xmm0 \n\t"\
+ "movups (%%"REG_c"), %%xmm1 \n\t"\
+ "movups 16(%%"REG_d"), %%xmm2 \n\t"\
+ "movups 16(%%"REG_c"), %%xmm3 \n\t"\
+ "pmaddwd %%xmm1, %%xmm0 \n\t"\
+ "pmaddwd %%xmm3, %%xmm2 \n\t"\
+ "paddd %%xmm2, %%xmm0 \n\t"\
+ "movhlps %%xmm0, %%xmm1 \n\t"\
+ "paddd %%xmm1, %%xmm0 \n\t"\
+ "pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
+ "paddd %%xmm1, %%xmm0 \n\t"\
+ "movd %%xmm0, %%ebx \n\t"\
+ "shrl $16, %%ebx \n\t"\
+ "movw %%bx, (%%"REG_a") \n\t"\
+ :\
+ : "a"(y),"d"(w),"c"(x)\
+ : "%ebx"\
+ )
+#else
+#define FIR(x,w,y) \
+ __asm__(\
+ "movq (%%"REG_d"), %%mm0 \n\t"\
+ "movq (%%"REG_c"), %%mm1 \n\t"\
+ "movq 8(%%"REG_d"), %%mm2 \n\t"\
+ "movq 8(%%"REG_c"), %%mm3 \n\t"\
+ "pmaddwd %%mm1, %%mm0 \n\t"\
+ "pmaddwd %%mm3, %%mm2 \n\t"\
+ "movq 16(%%"REG_d"), %%mm4 \n\t"\
+ "movq 16(%%"REG_c"), %%mm5 \n\t"\
+ "movq 24(%%"REG_d"), %%mm6 \n\t"\
+ "movq 24(%%"REG_c"), %%mm7 \n\t"\
+ "pmaddwd %%mm5, %%mm4 \n\t"\
+ "pmaddwd %%mm7, %%mm6 \n\t"\
+ "paddd %%mm2, %%mm0 \n\t"\
+ "paddd %%mm6, %%mm4 \n\t"\
+ "paddd %%mm4, %%mm0 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "punpckhdq %%mm0, %%mm0 \n\t"\
+ "paddd %%mm1, %%mm0 \n\t"\
+ "movd %%mm0, %%ebx \n\t"\
+ "shrl $16, %%ebx \n\t"\
+ "movw %%bx, (%%"REG_a") \n\t"\
+ :\
+ : "a"(y),"d"(w),"c"(x)\
+ : "%ebx"\
+ )
+#endif // FIRSSE2
+#else // FORMAT_I
+#ifdef FIRSSE
+#define FIR(x,w,y) \
+ __asm__(\
+ "movups (%%"REG_d"), %%xmm0 \n\t"\
+ "movups (%%"REG_c"), %%xmm1 \n\t"\
+ "movups 16(%%"REG_d"), %%xmm2 \n\t"\
+ "movups 16(%%"REG_c"), %%xmm3 \n\t"\
+ "mulps %%xmm1, %%xmm0 \n\t"\
+ "mulps %%xmm3, %%xmm2 \n\t"\
+ "movups 32(%%"REG_d"), %%xmm4 \n\t"\
+ "movups 32(%%"REG_c"), %%xmm5 \n\t"\
+ "addps %%xmm2, %%xmm0 \n\t"\
+ "movups 48(%%"REG_d"), %%xmm6 \n\t"\
+ "movups 48(%%"REG_c"), %%xmm7 \n\t"\
+ "mulps %%xmm5, %%xmm4 \n\t"\
+ "mulps %%xmm7, %%xmm6 \n\t"\
+ "addps %%xmm6, %%xmm4 \n\t"\
+ "addps %%xmm4, %%xmm0 \n\t"\
+ "movhlps %%xmm0, %%xmm1 \n\t"\
+ "addps %%xmm1, %%xmm0 \n\t"\
+ "movaps %%xmm0, %%xmm1 \n\t"\
+ "shufps $0x01, %%xmm0, %%xmm0 \n\t"\
+ "addps %%xmm1, %%xmm0 \n\t"\
+ "movss %%xmm0, (%%"REG_a") \n\t"\
+ :\
+ : "a"(y),"d"(w),"c"(x)\
+ )
+#elif defined (FIRSSE3)
+#define FIR(x,w,y) \
+ __asm__(\
+ "lddqu (%%"REG_d"), %%xmm0 \n\t"\
+ "lddqu (%%"REG_c"), %%xmm1 \n\t"\
+ "lddqu 16(%%"REG_d"), %%xmm2 \n\t"\
+ "lddqu 16(%%"REG_c"), %%xmm3 \n\t"\
+ "mulps %%xmm1, %%xmm0 \n\t"\
+ "mulps %%xmm3, %%xmm2 \n\t"\
+ "lddqu 32(%%"REG_d"), %%xmm4 \n\t"\
+ "lddqu 32(%%"REG_c"), %%xmm5 \n\t"\
+ "addps %%xmm2, %%xmm0 \n\t"\
+ "lddqu 48(%%"REG_d"), %%xmm6 \n\t"\
+ "lddqu 48(%%"REG_c"), %%xmm7 \n\t"\
+ "mulps %%xmm5, %%xmm4 \n\t"\
+ "mulps %%xmm7, %%xmm6 \n\t"\
+ "addps %%xmm6, %%xmm4 \n\t"\
+ "addps %%xmm4, %%xmm0 \n\t"\
+ "haddps %%xmm0, %%xmm0 \n\t"\
+ "haddps %%xmm0, %%xmm0 \n\t"\
+ "movss %%xmm0, (%%"REG_a") \n\t"\
+ :\
+ : "a"(y),"d"(w),"c"(x)\
+ )
+#elif defined(FIR3DNOW)
+#define FIR(x,w,y)\
+ __asm__(\
+ "movq (%%"REG_d"), %%mm0 \n\t"\
+ "movq (%%"REG_c"), %%mm1 \n\t"\
+ "movq 8(%%"REG_d"), %%mm2 \n\t"\
+ "movq 8(%%"REG_c"), %%mm3 \n\t"\
+ "pfmul %%mm1, %%mm0 \n\t"\
+ "pfmul %%mm3, %%mm2 \n\t"\
+ "movq 16(%%"REG_d"), %%mm4 \n\t"\
+ "movq 16(%%"REG_c"), %%mm5 \n\t"\
+ "pfadd %%mm2, %%mm0 \n\t"\
+ "movq 24(%%"REG_d"), %%mm6 \n\t"\
+ "movq 24(%%"REG_c"), %%mm7 \n\t"\
+ "pfmul %%mm5, %%mm4 \n\t"\
+ "pfmul %%mm7, %%mm6 \n\t"\
+ "movq 32(%%"REG_d"), %%mm1 \n\t"\
+ "movq 32(%%"REG_c"), %%mm2 \n\t"\
+ "pfadd %%mm6, %%mm4 \n\t"\
+ "pfmul %%mm2, %%mm1 \n\t"\
+ "movq 40(%%"REG_d"), %%mm2 \n\t"\
+ "movq 40(%%"REG_c"), %%mm3 \n\t"\
+ "pfadd %%mm4, %%mm0 \n\t"\
+ "pfmul %%mm3, %%mm2 \n\t"\
+ "movq 48(%%"REG_d"), %%mm4 \n\t"\
+ "movq 48(%%"REG_c"), %%mm5 \n\t"\
+ "pfadd %%mm2, %%mm1 \n\t"\
+ "pfmul %%mm5, %%mm4 \n\t"\
+ "movq 56(%%"REG_d"), %%mm6 \n\t"\
+ "movq 56(%%"REG_c"), %%mm7 \n\t"\
+ "pfadd %%mm1, %%mm0 \n\t"\
+ "pfmul %%mm7, %%mm6 \n\t"\
+ "pfadd %%mm6, %%mm4 \n\t"\
+ "pfadd %%mm4, %%mm0 \n\t"\
+ "pfacc %%mm0, %%mm0 \n\t"\
+ "movd %%mm0, (%%"REG_a") \n\t"\
+ :\
+ : "a"(y),"d"(w),"c"(x)\
+ )
+#else // FIRSSE
// Unrolled loop to speed up execution
#define FIR(x,w,y) \
y[0] = ( w[0] *x[0] +w[1] *x[1] +w[2] *x[2] +w[3] *x[3] \
+ w[4] *x[4] +w[5] *x[5] +w[6] *x[6] +w[7] *x[7] \
+ w[8] *x[8] +w[9] *x[9] +w[10]*x[10]+w[11]*x[11] \
+ w[12]*x[12]+w[13]*x[13]+w[14]*x[14]+w[15]*x[15] ) SHIFT
-
+#endif // FIRSSE
+#endif // FORMAT_I
#endif /* L8/L16 */
// Macro to add data to circular que
Regards,
Zhou Zongyi
More information about the MPlayer-dev-eng
mailing list