[FFmpeg-devel] [PATCH] Optimization for add_8x8basis

Zuxy Meng zuxy.meng
Sun May 13 15:56:28 CEST 2007


Hi,

3DNow! and SSSE3 provide instructions for packed multiplication with
rounding (pmulhrw and pmulhrsw respectively) which can be used to
replace pmulhw+paddw+psraw.

Is anyone willing to test it on a Core 2?
-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c	(revision 9014)
+++ libavcodec/i386/dsputil_mmx.c	(working copy)
@@ -2750,6 +2750,17 @@
 }
 
 #ifdef CONFIG_ENCODERS
+#define SETW_ONE_MMX(x)\
+    "pcmpeqw " #x ", " #x "          \n\t"\
+    "psrlw $15, " #x "               \n\t"
+#define PMULHRW(x, y, s, o)\
+    "pmulhw " #s ", "#x "            \n\t"\
+    "pmulhw " #s ", "#y "            \n\t"\
+    "paddw " #o ", "#x "             \n\t"\
+    "paddw " #o ", "#y "             \n\t"\
+    "psraw $1, "#x "                 \n\t"\
+    "psraw $1, "#y "                 \n\t"
+
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
@@ -2757,8 +2768,7 @@
     scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
 
     asm volatile(
-        "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
-        "psrlw $15, %%mm6               \n\t" //  1w
+        SETW_ONE_MMX(%%mm6)
         "pxor %%mm7, %%mm7              \n\t"
         "movd  %4, %%mm5                \n\t"
         "punpcklwd %%mm5, %%mm5         \n\t"
@@ -2766,12 +2776,7 @@
         "1:                             \n\t"
         "movq  (%1, %0), %%mm0          \n\t"
         "movq  8(%1, %0), %%mm1         \n\t"
-        "pmulhw %%mm5, %%mm0            \n\t"
-        "pmulhw %%mm5, %%mm1            \n\t"
-        "paddw %%mm6, %%mm0             \n\t"
-        "paddw %%mm6, %%mm1             \n\t"
-        "psraw $1, %%mm0                \n\t"
-        "psraw $1, %%mm1                \n\t"
+        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
         "paddw (%2, %0), %%mm0          \n\t"
         "paddw 8(%2, %0), %%mm1         \n\t"
         "psraw $6, %%mm0                \n\t"
@@ -2798,43 +2803,58 @@
     return i;
 }
 
-static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
-    long i=0;
+#define ADD_8X8BASIS(cpu, max_abs, offset)\
+static void add_8x8basis_##cpu(int16_t rem[64], int16_t basis[64], int scale){\
+    long i=0;\
+\
+    if(FFABS(scale) < max_abs){\
+        scale<<= 16 + offset - BASIS_SHIFT + RECON_SHIFT;\
+        asm volatile(\
+                SETW_ONE_MMX(%%mm6)\
+                "movd  %3, %%mm5        \n\t"\
+                "punpcklwd %%mm5, %%mm5 \n\t"\
+                "punpcklwd %%mm5, %%mm5 \n\t"\
+		ASMALIGN(4)\
+                "1:                     \n\t"\
+                "movq  (%1, %0), %%mm0  \n\t"\
+                "movq  8(%1, %0), %%mm1 \n\t"\
+                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)\
+                "paddw (%2, %0), %%mm0  \n\t"\
+                "paddw 8(%2, %0), %%mm1 \n\t"\
+                "movq %%mm0, (%2, %0)   \n\t"\
+                "movq %%mm1, 8(%2, %0)  \n\t"\
+                "add $16, %0            \n\t"\
+                "cmp $128, %0           \n\t" /* FIXME optimize & bench */\
+                " jb 1b                 \n\t"\
+\
+                : "+r" (i)\
+                : "r"(basis), "r"(rem), "g"(scale)\
+        );\
+    }else{\
+        for(i=0; i<8*8; i++){\
+            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);\
+        }\
+    }\
+}
 
-    if(FFABS(scale) < 256){
-        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
-        asm volatile(
-                "pcmpeqw %%mm6, %%mm6   \n\t" // -1w
-                "psrlw $15, %%mm6       \n\t" //  1w
-                "movd  %3, %%mm5        \n\t"
-                "punpcklwd %%mm5, %%mm5 \n\t"
-                "punpcklwd %%mm5, %%mm5 \n\t"
-                "1:                     \n\t"
-                "movq  (%1, %0), %%mm0  \n\t"
-                "movq  8(%1, %0), %%mm1 \n\t"
-                "pmulhw %%mm5, %%mm0    \n\t"
-                "pmulhw %%mm5, %%mm1    \n\t"
-                "paddw %%mm6, %%mm0     \n\t"
-                "paddw %%mm6, %%mm1     \n\t"
-                "psraw $1, %%mm0        \n\t"
-                "psraw $1, %%mm1        \n\t"
-                "paddw (%2, %0), %%mm0  \n\t"
-                "paddw 8(%2, %0), %%mm1 \n\t"
-                "movq %%mm0, (%2, %0)   \n\t"
-                "movq %%mm1, 8(%2, %0)  \n\t"
-                "add $16, %0            \n\t"
-                "cmp $128, %0           \n\t" //FIXME optimize & bench
-                " jb 1b                 \n\t"
+ADD_8X8BASIS(mmx, 256, 1)
 
-                : "+r" (i)
-                : "r"(basis), "r"(rem), "g"(scale)
-        );
-    }else{
-        for(i=0; i<8*8; i++){
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
+#undef SETW_ONE_MMX
+#undef PMULHRW
+#define SETW_ONE_MMX(x)
+#define PMULHRW(x, y, s, o)\
+    "pmulhrw " #s ", "#x "           \n\t"\
+    "pmulhrw " #s ", "#y "           \n\t"
+ADD_8X8BASIS(3dnow, 512, 0)
+
+#ifdef HAVE_SSSE3
+#undef PMULHRW
+#define PMULHRW(x, y, s, o)\
+    "pmulhrsw " #s ", "#x "          \n\t"\
+    "pmulhrsw " #s ", "#y "          \n\t"
+ADD_8X8BASIS(ssse3, 512, -1)
+#endif //HAVE_SSSE3
+
 #endif /* CONFIG_ENCODERS */
 
 #define PREFETCH(name, op) \
@@ -3646,6 +3666,7 @@
 
 #ifdef HAVE_SSSE3
         if(mm_flags & MM_SSSE3){
+            c->add_8x8basis= add_8x8basis_ssse3;
             c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
             c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
             c->hadamard8_diff[1]= hadamard8_diff_ssse3;
@@ -3667,6 +3688,9 @@
 #endif
 
         if(mm_flags & MM_3DNOW){
+#ifdef CONFIG_ENCODERS
+            c->add_8x8basis= add_8x8basis_3dnow;
+#endif //CONFIG_ENCODERS
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
             c->vector_fmul = vector_fmul_3dnow;
             if(!(avctx->flags & CODEC_FLAG_BITEXACT))



More information about the ffmpeg-devel mailing list