[FFmpeg-devel] [PATCH] Optimization for add_8x8basis

Michael Niedermayer michaelni
Sun May 13 23:34:01 CEST 2007


Hi

On Sun, May 13, 2007 at 09:56:28PM +0800, Zuxy Meng wrote:
> Hi,
> 
> 3DNow! and SSSE3 provide instructions for packed multiplication with
> rounding (pmulhrw and pmulhrsw respectively) which can be used to
> replace pmulhw+paddw+psraw.
> 
> Someone willing to test it on Core 2?
[...]
> -static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
> -    long i=0;
> +#define ADD_8X8BASIS(cpu, max_abs, offset)\
> +static void add_8x8basis_##cpu(int16_t rem[64], int16_t basis[64], int scale){\
> +    long i=0;\
> +\
> +    if(FFABS(scale) < max_abs){\
> +        scale<<= 16 + offset - BASIS_SHIFT + RECON_SHIFT;\
> +        asm volatile(\
> +                SETW_ONE_MMX(%%mm6)\
> +                "movd  %3, %%mm5        \n\t"\
> +                "punpcklwd %%mm5, %%mm5 \n\t"\
> +                "punpcklwd %%mm5, %%mm5 \n\t"\
> +		ASMALIGN(4)\
> +                "1:                     \n\t"\
> +                "movq  (%1, %0), %%mm0  \n\t"\
> +                "movq  8(%1, %0), %%mm1 \n\t"\
> +                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)\
> +                "paddw (%2, %0), %%mm0  \n\t"\
> +                "paddw 8(%2, %0), %%mm1 \n\t"\
> +                "movq %%mm0, (%2, %0)   \n\t"\
> +                "movq %%mm1, 8(%2, %0)  \n\t"\
> +                "add $16, %0            \n\t"\
> +                "cmp $128, %0           \n\t" /* FIXME optimize & bench */\
> +                " jb 1b                 \n\t"\
> +\
> +                : "+r" (i)\
> +                : "r"(basis), "r"(rem), "g"(scale)\
> +        );\
> +    }else{\
> +        for(i=0; i<8*8; i++){\
> +            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);\
> +        }\
> +    }\
> +}
>  
> -    if(FFABS(scale) < 256){
> -        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
> -        asm volatile(
> -                "pcmpeqw %%mm6, %%mm6   \n\t" // -1w
> -                "psrlw $15, %%mm6       \n\t" //  1w
> -                "movd  %3, %%mm5        \n\t"
> -                "punpcklwd %%mm5, %%mm5 \n\t"
> -                "punpcklwd %%mm5, %%mm5 \n\t"
> -                "1:                     \n\t"
> -                "movq  (%1, %0), %%mm0  \n\t"
> -                "movq  8(%1, %0), %%mm1 \n\t"
> -                "pmulhw %%mm5, %%mm0    \n\t"
> -                "pmulhw %%mm5, %%mm1    \n\t"
> -                "paddw %%mm6, %%mm0     \n\t"
> -                "paddw %%mm6, %%mm1     \n\t"
> -                "psraw $1, %%mm0        \n\t"
> -                "psraw $1, %%mm1        \n\t"
> -                "paddw (%2, %0), %%mm0  \n\t"
> -                "paddw 8(%2, %0), %%mm1 \n\t"
> -                "movq %%mm0, (%2, %0)   \n\t"
> -                "movq %%mm1, 8(%2, %0)  \n\t"
> -                "add $16, %0            \n\t"
> -                "cmp $128, %0           \n\t" //FIXME optimize & bench
> -                " jb 1b                 \n\t"
> +ADD_8X8BASIS(mmx, 256, 1)
>  
> -                : "+r" (i)
> -                : "r"(basis), "r"(rem), "g"(scale)
> -        );
> -    }else{
> -        for(i=0; i<8*8; i++){
> -            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
> -        }
> -    }
> -}
> +#undef SETW_ONE_MMX
> +#undef PMULHRW
> +#define SETW_ONE_MMX(x)
> +#define PMULHRW(x, y, s, o)\
> +    "pmulhrw " #s ", "#x "           \n\t"\
> +    "pmulhrw " #s ", "#y "           \n\t"
> +ADD_8X8BASIS(3dnow, 512, 0)
> +
> +#ifdef HAVE_SSSE3
> +#undef PMULHRW
> +#define PMULHRW(x, y, s, o)\
> +    "pmulhrsw " #s ", "#x "          \n\t"\
> +    "pmulhrsw " #s ", "#y "          \n\t"
> +ADD_8X8BASIS(ssse3, 512, -1)
> +#endif //HAVE_SSSE3

This is messy, especially if it's done with many functions
(I know that, as I did it with some of the motion compensation code ...).

I would suggest that the function be put in a file of its own
and that this file be #included 3 times with the different defines.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Let us carefully observe those good qualities wherein our enemies excel us
and endeavor to excel them, by avoiding what is faulty, and imitating what
is excellent in them. -- Plutarch
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070513/557a7149/attachment.pgp>



More information about the ffmpeg-devel mailing list