[FFmpeg-devel] [PATCH] MMX/SSE2 qpel functions for RV40

Sun Jan 4 23:36:29 CET 2009

On Jan 4, 2009, at 3:45 PM, Mathieu Velten wrote:

> patch attached.
>
> more than 15% speedup on my sample.
>
> Mathieu Velten

> +static av_noinline void OPNAME ## rv40_qpel8or16_v_lowpass_ ##  
> MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,\
> +                                                                int  
> h, const xmm_reg *C1_reg, const xmm_reg *C2_reg,\
> +                                                                 
> const xmm_reg *rnd_reg, const int SHIFT){\
> +    int w= 2;\
> +    src -= 2*srcStride;\
> +    \
> +    while(w--){\
> +      __asm__ volatile(\
> +        "pxor %%mm7, %%mm7          \n\t"\
> +        "movd (%0), %%mm0           \n\t"\
> +        "add %2, %0                 \n\t"\
> +        "movd (%0), %%mm1           \n\t"\
> +        "add %2, %0                 \n\t"\
> +        "movd (%0), %%mm2           \n\t"\
> +        "add %2, %0                 \n\t"\
> +        "movd (%0), %%mm3           \n\t"\
> +        "add %2, %0                 \n\t"\
> +        "movd (%0), %%mm4           \n\t"\
> +        "add %2, %0                 \n\t"\
> +        "punpcklbw %%mm7, %%mm0     \n\t"\
> +        "punpcklbw %%mm7, %%mm1     \n\t"\
> +        "punpcklbw %%mm7, %%mm2     \n\t"\
> +        "punpcklbw %%mm7, %%mm3     \n\t"\
> +        "punpcklbw %%mm7, %%mm4     \n\t"\
> +        QPEL_RV40V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
> +        QPEL_RV40V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
> +        QPEL_RV40V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
> +        QPEL_RV40V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
> +        QPEL_RV40V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
> +        QPEL_RV40V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
> +        QPEL_RV40V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
> +        QPEL_RV40V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
> +         \
> +        : "+a"(src), "+c"(dst)\
> +        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),\
> +        "m"(ff_pw_5), "m"(*C1_reg), "m"(*C2_reg), "m"(*rnd_reg),  
> "g"((x86_reg)SHIFT)\
> +        : "memory"\
> +     );\
> +     if(h==16){\
> +        __asm__ volatile(\
> +            QPEL_RV40V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
> +            QPEL_RV40V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
> +            QPEL_RV40V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
> +            QPEL_RV40V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
> +            QPEL_RV40V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
> +            QPEL_RV40V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
> +            QPEL_RV40V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
> +            QPEL_RV40V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
> +            \
> +            : "+a"(src), "+c"(dst)\
> +            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),\
> +            "m"(ff_pw_5), "m"(*C1_reg), "m"(*C2_reg),  
> "m"(*rnd_reg), "g"((x86_reg)SHIFT)\
> +            : "memory"\

Just wanted to note that this breaks the OS X shared build (possibly
Linux x86_32 with -fPIC as well, but that seems to be already broken
for me) since gcc fails to find enough registers. I'm trying to figure
out a workaround, but it's not as simple as it was for cavs, since gcc
wants to use three registers for C1_reg, C2_reg, and rnd_reg but leaves
SHIFT on the stack... Perhaps it should instead be put under HAVE_7REGS,
since I'm not sure it's possible to use fewer than 7 registers without
inlining this function.

Alternatively, as suggested on IRC, I wouldn't mind using
-mdynamic-no-pic by default for shared libs on OS X to simply avoid
this whole mess (and other larger ones with gcc 4.2+); my only reason
for messing with this is to keep "./configure --enable-shared" able to
compile on OS X. x264 does this, but has --enable-pic to disable using
the flag.

-David