[FFmpeg-devel] [PATCH] Optimization for add_8x8basis
Michael Niedermayer
michaelni
Sun May 13 23:34:01 CEST 2007
Hi
On Sun, May 13, 2007 at 09:56:28PM +0800, Zuxy Meng wrote:
> Hi,
>
> 3DNow! and SSSE3 provide instructions for packed multiplication with
> rounding (pmulhrw and pmulhrsw respectively) which can be used to
> replace pmulhw+paddw+psraw.
>
> Someone willing to test it on Core 2?
[...]
> -static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
> - long i=0;
> +#define ADD_8X8BASIS(cpu, max_abs, offset)\
> +static void add_8x8basis_##cpu(int16_t rem[64], int16_t basis[64], int scale){\
> + long i=0;\
> +\
> + if(FFABS(scale) < max_abs){\
> + scale<<= 16 + offset - BASIS_SHIFT + RECON_SHIFT;\
> + asm volatile(\
> + SETW_ONE_MMX(%%mm6)\
> + "movd %3, %%mm5 \n\t"\
> + "punpcklwd %%mm5, %%mm5 \n\t"\
> + "punpcklwd %%mm5, %%mm5 \n\t"\
> + ASMALIGN(4)\
> + "1: \n\t"\
> + "movq (%1, %0), %%mm0 \n\t"\
> + "movq 8(%1, %0), %%mm1 \n\t"\
> + PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)\
> + "paddw (%2, %0), %%mm0 \n\t"\
> + "paddw 8(%2, %0), %%mm1 \n\t"\
> + "movq %%mm0, (%2, %0) \n\t"\
> + "movq %%mm1, 8(%2, %0) \n\t"\
> + "add $16, %0 \n\t"\
> + "cmp $128, %0 \n\t" /* FIXME optimize & bench */\
> + " jb 1b \n\t"\
> +\
> + : "+r" (i)\
> + : "r"(basis), "r"(rem), "g"(scale)\
> + );\
> + }else{\
> + for(i=0; i<8*8; i++){\
> + rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);\
> + }\
> + }\
> +}
>
> - if(FFABS(scale) < 256){
> - scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
> - asm volatile(
> - "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
> - "psrlw $15, %%mm6 \n\t" // 1w
> - "movd %3, %%mm5 \n\t"
> - "punpcklwd %%mm5, %%mm5 \n\t"
> - "punpcklwd %%mm5, %%mm5 \n\t"
> - "1: \n\t"
> - "movq (%1, %0), %%mm0 \n\t"
> - "movq 8(%1, %0), %%mm1 \n\t"
> - "pmulhw %%mm5, %%mm0 \n\t"
> - "pmulhw %%mm5, %%mm1 \n\t"
> - "paddw %%mm6, %%mm0 \n\t"
> - "paddw %%mm6, %%mm1 \n\t"
> - "psraw $1, %%mm0 \n\t"
> - "psraw $1, %%mm1 \n\t"
> - "paddw (%2, %0), %%mm0 \n\t"
> - "paddw 8(%2, %0), %%mm1 \n\t"
> - "movq %%mm0, (%2, %0) \n\t"
> - "movq %%mm1, 8(%2, %0) \n\t"
> - "add $16, %0 \n\t"
> - "cmp $128, %0 \n\t" //FIXME optimize & bench
> - " jb 1b \n\t"
> +ADD_8X8BASIS(mmx, 256, 1)
>
> - : "+r" (i)
> - : "r"(basis), "r"(rem), "g"(scale)
> - );
> - }else{
> - for(i=0; i<8*8; i++){
> - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
> - }
> - }
> -}
> +#undef SETW_ONE_MMX
> +#undef PMULHRW
> +#define SETW_ONE_MMX(x)
> +#define PMULHRW(x, y, s, o)\
> + "pmulhrw " #s ", "#x " \n\t"\
> + "pmulhrw " #s ", "#y " \n\t"
> +ADD_8X8BASIS(3dnow, 512, 0)
> +
> +#ifdef HAVE_SSSE3
> +#undef PMULHRW
> +#define PMULHRW(x, y, s, o)\
> + "pmulhrsw " #s ", "#x " \n\t"\
> + "pmulhrsw " #s ", "#y " \n\t"
> +ADD_8X8BASIS(ssse3, 512, -1)
> +#endif //HAVE_SSSE3
This is messy, especially if it's done with many functions
(I know that, as I did it with some of the motion compensation code ...).
I would suggest that the function be put in a file of its own
and that this file be #included 3 times with the different defines.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Let us carefully observe those good qualities wherein our enemies excel us
and endeavor to excel them, by avoiding what is faulty, and imitating what
is excellent in them. -- Plutarch
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070513/557a7149/attachment.pgp>
More information about the ffmpeg-devel
mailing list