[FFmpeg-devel] [PATCH] Optimization for add_8x8basis
Zuxy Meng
zuxy.meng
Sun May 13 15:56:28 CEST 2007
Hi,
3DNow! and SSSE3 provide instructions for packed multiplication with
rounding (pmulhrw and pmulhrsw respectively), which can be used to
replace the pmulhw+paddw+psraw sequence.
Is anyone willing to test it on a Core 2?
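For reference, here is the per-lane arithmetic each variant computes as I
read the instruction manuals (plain C sketch for illustration only, not
part of the patch; the helper names are made up and it assumes gcc's
arithmetic right shift on signed int):

#include <stdint.h>

/* MMX: pmulhw + paddw 1 + psraw 1 */
static inline int16_t mulh_round_mmx(int16_t a, int16_t b)
{
    return (int16_t)((((a * b) >> 16) + 1) >> 1);
}

/* 3DNow!: pmulhrw */
static inline int16_t mulh_round_3dnow(int16_t a, int16_t b)
{
    return (int16_t)((a * b + 0x8000) >> 16);
}

/* SSSE3: pmulhrsw */
static inline int16_t mulh_round_ssse3(int16_t a, int16_t b)
{
    return (int16_t)((((a * b) >> 14) + 1) >> 1);
}

The three round at different bit positions (2^17, 2^16 and 2^15), which is
why the scale pre-shift in the patch gets a per-variant offset of 1, 0 and
-1 respectively.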
--
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c   (revision 9014)
+++ libavcodec/i386/dsputil_mmx.c   (working copy)
@@ -2750,6 +2750,17 @@
}
#ifdef CONFIG_ENCODERS
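+// SETW_ONE_MMX(x): set every 16-bit lane of x to 1 (rounding bias for PMULHRW)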
+#define SETW_ONE_MMX(x)\
+ "pcmpeqw " #x ", " #x " \n\t"\
+ "psrlw $15, " #x " \n\t"
+#define PMULHRW(x, y, s, o)\
+ "pmulhw " #s ", "#x " \n\t"\
+ "pmulhw " #s ", "#y " \n\t"\
+ "paddw " #o ", "#x " \n\t"\
+ "paddw " #o ", "#y " \n\t"\
+ "psraw $1, "#x " \n\t"\
+ "psraw $1, "#y " \n\t"
+
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
long i=0;
@@ -2757,8 +2768,7 @@
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
+ SETW_ONE_MMX(%%mm6)
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
@@ -2766,12 +2776,7 @@
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
@@ -2798,43 +2803,58 @@
return i;
}
-static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
- long i=0;
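+// ADD_8X8BASIS(cpu, max_abs, offset) instantiates add_8x8basis_<cpu>();
+// 'offset' adjusts the scale pre-shift to match where the PMULHRW variant rounds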
+#define ADD_8X8BASIS(cpu, max_abs, offset)\
+static void add_8x8basis_##cpu(int16_t rem[64], int16_t basis[64], int scale){\
+ long i=0;\
+\
+ if(FFABS(scale) < max_abs){\
+ scale<<= 16 + offset - BASIS_SHIFT + RECON_SHIFT;\
+ asm volatile(\
+ SETW_ONE_MMX(%%mm6)\
+ "movd %3, %%mm5 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ "punpcklwd %%mm5, %%mm5 \n\t"\
+ ASMALIGN(4)\
+ "1: \n\t"\
+ "movq (%1, %0), %%mm0 \n\t"\
+ "movq 8(%1, %0), %%mm1 \n\t"\
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)\
+ "paddw (%2, %0), %%mm0 \n\t"\
+ "paddw 8(%2, %0), %%mm1 \n\t"\
+ "movq %%mm0, (%2, %0) \n\t"\
+ "movq %%mm1, 8(%2, %0) \n\t"\
+ "add $16, %0 \n\t"\
+ "cmp $128, %0 \n\t" /* FIXME optimize & bench */\
+ " jb 1b \n\t"\
+\
+ : "+r" (i)\
+ : "r"(basis), "r"(rem), "g"(scale)\
+ );\
+ }else{\
+ for(i=0; i<8*8; i++){\
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);\
+ }\
+ }\
+}
- if(FFABS(scale) < 256){
- scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
- asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
+ADD_8X8BASIS(mmx, 256, 1)
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
+#undef SETW_ONE_MMX
+#undef PMULHRW
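+// 3DNow!: pmulhrw computes (t*s + 0x8000) >> 16 per lane, so neither the
+// rounding add/shift nor the lane of ones is needed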
+#define SETW_ONE_MMX(x)
+#define PMULHRW(x, y, s, o)\
+ "pmulhrw " #s ", "#x " \n\t"\
+ "pmulhrw " #s ", "#y " \n\t"
+ADD_8X8BASIS(3dnow, 512, 0)
+
+#ifdef HAVE_SSSE3
+#undef PMULHRW
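+// SSSE3: pmulhrsw computes ((t*s >> 14) + 1) >> 1 per lane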
+#define PMULHRW(x, y, s, o)\
+ "pmulhrsw " #s ", "#x " \n\t"\
+ "pmulhrsw " #s ", "#y " \n\t"
+ADD_8X8BASIS(ssse3, 512, -1)
+#endif //HAVE_SSSE3
+
#endif /* CONFIG_ENCODERS */
#define PREFETCH(name, op) \
@@ -3646,6 +3666,7 @@
#ifdef HAVE_SSSE3
if(mm_flags & MM_SSSE3){
+ c->add_8x8basis= add_8x8basis_ssse3;
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
@@ -3667,6 +3688,9 @@
#endif
if(mm_flags & MM_3DNOW){
+#ifdef CONFIG_ENCODERS
+ c->add_8x8basis= add_8x8basis_3dnow;
+#endif //CONFIG_ENCODERS
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT))