[FFmpeg-devel] [PATCH] Optimization for add_8x8basis

Zuxy Meng zuxy.meng
Mon May 14 13:22:11 CEST 2007


Hi,

2007/5/14, Michael Niedermayer <michaelni at gmx.at>:
> Hi
>
> On Sun, May 13, 2007 at 09:56:28PM +0800, Zuxy Meng wrote:
> > Hi,
> >
> > 3DNow! and SSSE3 provide instructions for packed multiplication with
> > rounding (pmulhrw and pmulhrsw respectively) which can be used to
> > replace pmulhw+paddw+psraw.
> >
> > Someone willing to test it on Core 2?
> [...]
> > -static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
> > -    long i=0;
> > +#define ADD_8X8BASIS(cpu, max_abs, offset)\
> > +static void add_8x8basis_##cpu(int16_t rem[64], int16_t basis[64], int scale){\
> > +    long i=0;\
> > +\
> > +    if(FFABS(scale) < max_abs){\
> > +        scale<<= 16 + offset - BASIS_SHIFT + RECON_SHIFT;\
> > +        asm volatile(\
> > +                SETW_ONE_MMX(%%mm6)\
> > +                "movd  %3, %%mm5        \n\t"\
> > +                "punpcklwd %%mm5, %%mm5 \n\t"\
> > +                "punpcklwd %%mm5, %%mm5 \n\t"\
> > +             ASMALIGN(4)\
> > +                "1:                     \n\t"\
> > +                "movq  (%1, %0), %%mm0  \n\t"\
> > +                "movq  8(%1, %0), %%mm1 \n\t"\
> > +                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)\
> > +                "paddw (%2, %0), %%mm0  \n\t"\
> > +                "paddw 8(%2, %0), %%mm1 \n\t"\
> > +                "movq %%mm0, (%2, %0)   \n\t"\
> > +                "movq %%mm1, 8(%2, %0)  \n\t"\
> > +                "add $16, %0            \n\t"\
> > +                "cmp $128, %0           \n\t" /* FIXME optimize & bench */\
> > +                " jb 1b                 \n\t"\
> > +\
> > +                : "+r" (i)\
> > +                : "r"(basis), "r"(rem), "g"(scale)\
> > +        );\
> > +    }else{\
> > +        for(i=0; i<8*8; i++){\
> > +            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);\
> > +        }\
> > +    }\
> > +}
> >
> > -    if(FFABS(scale) < 256){
> > -        scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
> > -        asm volatile(
> > -                "pcmpeqw %%mm6, %%mm6   \n\t" // -1w
> > -                "psrlw $15, %%mm6       \n\t" //  1w
> > -                "movd  %3, %%mm5        \n\t"
> > -                "punpcklwd %%mm5, %%mm5 \n\t"
> > -                "punpcklwd %%mm5, %%mm5 \n\t"
> > -                "1:                     \n\t"
> > -                "movq  (%1, %0), %%mm0  \n\t"
> > -                "movq  8(%1, %0), %%mm1 \n\t"
> > -                "pmulhw %%mm5, %%mm0    \n\t"
> > -                "pmulhw %%mm5, %%mm1    \n\t"
> > -                "paddw %%mm6, %%mm0     \n\t"
> > -                "paddw %%mm6, %%mm1     \n\t"
> > -                "psraw $1, %%mm0        \n\t"
> > -                "psraw $1, %%mm1        \n\t"
> > -                "paddw (%2, %0), %%mm0  \n\t"
> > -                "paddw 8(%2, %0), %%mm1 \n\t"
> > -                "movq %%mm0, (%2, %0)   \n\t"
> > -                "movq %%mm1, 8(%2, %0)  \n\t"
> > -                "add $16, %0            \n\t"
> > -                "cmp $128, %0           \n\t" //FIXME optimize & bench
> > -                " jb 1b                 \n\t"
> > +ADD_8X8BASIS(mmx, 256, 1)
> >
> > -                : "+r" (i)
> > -                : "r"(basis), "r"(rem), "g"(scale)
> > -        );
> > -    }else{
> > -        for(i=0; i<8*8; i++){
> > -            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
> > -        }
> > -    }
> > -}
> > +#undef SETW_ONE_MMX
> > +#undef PMULHRW
> > +#define SETW_ONE_MMX(x)
> > +#define PMULHRW(x, y, s, o)\
> > +    "pmulhrw " #s ", "#x "           \n\t"\
> > +    "pmulhrw " #s ", "#y "           \n\t"
> > +ADD_8X8BASIS(3dnow, 512, 0)
> > +
> > +#ifdef HAVE_SSSE3
> > +#undef PMULHRW
> > +#define PMULHRW(x, y, s, o)\
> > +    "pmulhrsw " #s ", "#x "          \n\t"\
> > +    "pmulhrsw " #s ", "#y "          \n\t"
> > +ADD_8X8BASIS(ssse3, 512, -1)
> > +#endif //HAVE_SSSE3
>
> This is messy, especially if it's done with many functions
> (I know that, as I did it with some of the motion compensation code ...)
>
> I would suggest that the function be put in a file
> and that the file be #included 3 times with the different defines

Wouldn't a new file for just one function (or two, if try_8x8basis is
added later) be considered a bit costly?
-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6




More information about the ffmpeg-devel mailing list