[FFmpeg-devel] [PATCH] MMX VP3 Loop Filter

Sat Oct 4 20:18:52 CEST 2008

On Sat, Oct 04, 2008 at 12:21:42AM -0400, David Conrad wrote:
> Hi,
>
> This was adapted from libtheora and gives an overall speedup of about 5%.
>
> 685 dezicycles in ff_vp3_v_loop_filter_mmx, 8388422 runs, 186 skips
> 1334 dezicycles in ff_vp3_h_loop_filter_mmx, 8388231 runs, 377 skips
> 680 dezicycles in ff_vp3_v_loop_filter_mmx, 16776868 runs, 348 skips
> 1327 dezicycles in ff_vp3_h_loop_filter_mmx, 16776505 runs, 711 skips
> 688 dezicycles in ff_vp3_v_loop_filter_mmx, 33553784 runs, 648 skips
> 1350 dezicycles in ff_vp3_h_loop_filter_mmx, 33552962 runs, 1470 skips
>
> 1553 dezicycles in ff_vp3_v_loop_filter_c, 8388263 runs, 345 skips
> 1722 dezicycles in ff_vp3_h_loop_filter_c, 8388149 runs, 459 skips
> 1551 dezicycles in ff_vp3_v_loop_filter_c, 16776600 runs, 616 skips
> 1710 dezicycles in ff_vp3_h_loop_filter_c, 16776297 runs, 919 skips
> 1556 dezicycles in ff_vp3_v_loop_filter_c, 33553133 runs, 1299 skips
> 1737 dezicycles in ff_vp3_h_loop_filter_c, 33552271 runs, 2161 skips
>

> commit 826c801ddf0e1844b6c8970c63068a6af8181fdd
> Author: David Conrad <davedc at Kozue.local>
> Date:   Wed Oct 1 18:18:53 2008 -0400
> 
>     MMX VP3 loop filter, adapted from libtheora
> 
> diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
> index 6e1a93d..2f035d2 100644
> --- a/libavcodec/i386/dsputil_mmx.c
> +++ b/libavcodec/i386/dsputil_mmx.c
> @@ -2591,6 +2591,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
>              c->h263_v_loop_filter= h263_v_loop_filter_mmx;
>              c->h263_h_loop_filter= h263_h_loop_filter_mmx;
>          }
> +        if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
> +            c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx;
> +            c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx;
> +        }
>          c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
>          c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
>          c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
> diff --git a/libavcodec/i386/dsputil_mmx.h b/libavcodec/i386/dsputil_mmx.h
> index f095975..a4a9eab 100644
> --- a/libavcodec/i386/dsputil_mmx.h
> +++ b/libavcodec/i386/dsputil_mmx.h
> @@ -86,6 +86,20 @@ extern const double ff_pd_2[2];
>      SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
>      SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
>  
> +#define TRANSPOSE8x4(a,b,c,d,e,f,g,h)\
> +    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
> +    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
> +    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\
> +    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
> +    SBUTTERFLY(a, b, e, bw, q)   /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
> +                                 /* e= a2 b2 e2 f2 a3 b3 e3 f3 */\
> +    SBUTTERFLY(c, d, b, bw, q)   /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
> +                                 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
> +    SBUTTERFLY(a, c, d, wd, q)   /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
> +                                 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
> +    SBUTTERFLY(e, b, c, wd, q)   /* e= a2 b2 c2 d2 e2 f2 g2 h2 */\
> +                                 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
> +
>  #ifdef ARCH_X86_64
>  // permutes 01234567 -> 05736421
>  #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
> diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/i386/vp3dsp_mmx.c
> index 6304e91..83121d9 100644
> --- a/libavcodec/i386/vp3dsp_mmx.c
> +++ b/libavcodec/i386/vp3dsp_mmx.c
> @@ -21,13 +21,192 @@
>  /**
>   * @file vp3dsp_mmx.c
>   * MMX-optimized functions cribbed from the original VP3 source code.
> + * MMX loop filter from libtheora
>   */
>  
> +#include "libavutil/x86_cpu.h"
>  #include "libavcodec/dsputil.h"
>  #include "dsputil_mmx.h"
>  
>  extern const uint16_t ff_vp3_idct_data[];
>  
> +// in:  p0 in mm7, p1 in mm4, p2 in mm2, p3 in mm1
> +// out: p1 in mm4, p2 in mm1
> +#define VP3_LOOP_FILTER(flim, pw3, pw4) \
> +    "pxor       %%mm0, %%mm0 \n\t" \

> +    "movq       %%mm7, %%mm6 \n\t" \
> +    "punpcklbw  %%mm0, %%mm6 \n\t" \

After a mov, operating first on the register that was not just written to (here the source, %%mm7, rather than the fresh copy in %%mm6) is likely faster.

> +    "movq       %%mm1, %%mm5 \n\t" \
> +    "punpckhbw  %%mm0, %%mm7 \n\t" \
> +    "punpcklbw  %%mm0, %%mm1 \n\t" \
> +    "punpckhbw  %%mm0, %%mm5 \n\t" \
> +    "psubw      %%mm1, %%mm6 \n\t" \
> +    "psubw      %%mm5, %%mm7 \n\t" /* mm7:mm6 = p0-p3 */ \
> +    "movq       %%mm4, %%mm5 \n\t" \
> +    "movq       %%mm2, %%mm3 \n\t" \
> +    "movq       %%mm2, %%mm1 \n\t" \
> +    "punpckhbw  %%mm0, %%mm5 \n\t" \
> +    "punpcklbw  %%mm0, %%mm4 \n\t" \
> +    "punpckhbw  %%mm0, %%mm3 \n\t" \
> +    "punpcklbw  %%mm0, %%mm2 \n\t" \
> +    "psubw      %%mm5, %%mm3 \n\t" \
> +    "psubw      %%mm4, %%mm2 \n\t" /* mm3:mm2 = p2-p1 */ \
> +    "pmullw    "#pw3", %%mm3 \n\t" \
> +    "pmullw    "#pw3", %%mm2 \n\t" \
> +    "paddw      %%mm7, %%mm3 \n\t" \
> +    "paddw      %%mm6, %%mm2 \n\t" /* mm3:mm2 = p0-p3 + 3*(p2-p1) */ \
> +    "paddw     "#pw4", %%mm3 \n\t" \
> +    "paddw     "#pw4", %%mm2 \n\t" \
> +    "psraw         $3, %%mm3 \n\t" \
> +    "psraw         $3, %%mm2 \n\t" \

> +    "packuswb   %%mm5, %%mm4 \n\t" /* mm4 = (p0-p3 + 3*(p2-p1) + 4) >> 3 */ \

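?
The comment has nothing to do with the code: this packuswb repacks the unpacked p1 words (mm4/mm5) into bytes, while the filter value described by the comment was just computed in mm3:mm2.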

> +\
> +    "movd     "#flim", %%mm5 \n\t" \
> +    "punpcklbw  %%mm5, %%mm5 \n\t" \
> +    "punpcklbw  %%mm5, %%mm5 \n\t" \
> +    "punpcklbw  %%mm0, %%mm5 \n\t" /*mm5 = L L L L */ \

I think you can precalculate that.

> +/*if(R_i<-2L||R_i>2L)R_i=0:*/ \
> +    "movq       %%mm2, %%mm0 \n\t" \
> +    "pxor       %%mm6, %%mm6 \n\t" \

mm0 was already 0; don't overwrite it and then zero another register ...

> +    "movq       %%mm5, %%mm7 \n\t" \
> +    "psubw      %%mm5, %%mm6 \n\t" \

> +    "psllw         $1, %%mm7 \n\t" \

Use paddw (add the register to itself) instead of shifting left by 1.

> +    "psllw         $1, %%mm6 \n\t" \
> +/*mm2==R_3 R_2 R_1 R_0*/ \
> +/*mm0==R_3 R_2 R_1 R_0*/ \
> +/*mm6==-2L -2L -2L -2L*/ \
> +/*mm7==2L 2L 2L 2L*/ \
> +    "pcmpgtw    %%mm2, %%mm7 \n\t" \
> +    "pcmpgtw    %%mm6, %%mm0 \n\t" \
> +    "pand       %%mm7, %%mm2 \n\t" \
> +    "movq       %%mm5, %%mm7 \n\t" \
> +    "pand       %%mm0, %%mm2 \n\t" \
> +    "psllw         $1, %%mm7 \n\t" \
> +    "movq       %%mm3, %%mm0 \n\t" \
> +/*mm3==R_7 R_6 R_5 R_4*/ \
> +/*mm0==R_7 R_6 R_5 R_4*/ \
> +/*mm6==-2L -2L -2L -2L*/ \
> +/*mm7==2L 2L 2L 2L*/ \
> +    "pcmpgtw    %%mm3, %%mm7 \n\t" \
> +    "pcmpgtw    %%mm6, %%mm0 \n\t" \
> +    "pand       %%mm7, %%mm3 \n\t" \
> +    "movq       %%mm5, %%mm7 \n\t" \
> +    "pand       %%mm0, %%mm3 \n\t" \
> +/*if(R_i<-L)R_i'=R_i+2L;*/ \
> +/*if(R_i>L)R_i'=R_i-2L;*/ \
> +/*if(R_i<-L||R_i>L)R_i=-R_i':*/ \
> +    "psraw         $1, %%mm6 \n\t" \
> +    "movq       %%mm2, %%mm0 \n\t" \
> +    "psllw         $1, %%mm7 \n\t" \
> +/*mm2==R_3 R_2 R_1 R_0*/ \
> +/*mm5==R_3 R_2 R_1 R_0*/ \
> +/*mm6==-L -L -L -L*/ \
> +/*mm5==L L L L*/ \
> +    "pcmpgtw    %%mm5, %%mm0 \n\t" /*mm0=R_i>L?FF:00*/ \
> +    "pcmpgtw    %%mm2, %%mm6 \n\t" /*mm6=-L>R_i?FF:00*/ \
> +    "pand       %%mm0, %%mm7 \n\t" /*mm7=R_i>L?2L:0*/ \
> +    "psubw      %%mm7, %%mm2 \n\t" /*mm2=R_i>L?R_i-2L:R_i*/ \
> +    "movq       %%mm5, %%mm7 \n\t" \
> +    "por        %%mm6, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L*/ \
> +    "psllw         $1, %%mm7 \n\t" \
> +    "pand       %%mm6, %%mm7 \n\t" /*mm7=-L>R_i?2L:0*/ \
> +    "pxor       %%mm6, %%mm6 \n\t" \
> +    "paddw      %%mm7, %%mm2 \n\t" /*mm2=-L>R_i?R_i+2L:R_i*/ \
> +    "psubw      %%mm5, %%mm6 \n\t" \
> +    "pand       %%mm2, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L?-R_i':0*/ \
> +    "movq       %%mm5, %%mm7 \n\t" \
> +    "psubw      %%mm0, %%mm2 \n\t" /*mm2=-L>R_i||R_i>L?0:R_i*/ \
> +    "psllw         $1, %%mm7 \n\t" \
> +    "psubw      %%mm0, %%mm2 \n\t" /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ \
> +    "movq       %%mm3, %%mm0 \n\t" \
> +/*mm3==R_7 R_6 R_5 R_4*/ \
> +/*mm5==R_7 R_6 R_5 R_4*/ \
> +/*mm6==-L -L -L -L*/ \
> +/*mm0==L L L L*/ \
> +    "pcmpgtw    %%mm3, %%mm6 \n\t" /*mm6=-L>R_i?FF:00*/ \
> +    "pcmpgtw    %%mm5, %%mm0 \n\t" /*mm0=R_i>L?FF:00*/ \
> +    "pand       %%mm0, %%mm7 \n\t" /*mm7=R_i>L?2L:0*/ \
> +    "psubw      %%mm7, %%mm3 \n\t" /*mm3=R_i>L?R_i-2L:R_i*/ \
> +    "psllw         $1, %%mm5 \n\t" \
> +    "por        %%mm6, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L*/ \
> +    "pand       %%mm6, %%mm5 \n\t" /*mm5=-L>R_i?2L:0*/ \
> +    "paddw      %%mm5, %%mm3 \n\t" /*mm3=-L>R_i?R_i+2L:R_i*/ \
> +    "pand       %%mm3, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L?-R_i':0*/ \
> +    "psubw      %%mm0, %%mm3 \n\t" /*mm3=-L>R_i||R_i>L?0:R_i*/ \
> +    "psubw      %%mm0, %%mm3 \n\t" /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ \
> +/* We need u8+s8 with unsigned saturation, so promote to 16 bits */ \
> +    "pxor       %%mm5, %%mm5 \n\t" \

> +    "movq       %%mm4, %%mm0 \n\t" \

> +    "punpcklbw  %%mm5, %%mm4 \n\t" \
> +    "punpckhbw  %%mm5, %%mm0 \n\t" \
> +    "movq       %%mm1, %%mm6 \n\t" \
> +    "punpcklbw  %%mm5, %%mm1 \n\t" \
> +    "punpckhbw  %%mm5, %%mm6 \n\t" \

Aren't these unpacks duplicated?

Anyway, I think the code is of pretty poor quality; I suspect the same can be
done with half as many instructions or fewer.
Something like:
packuswb    %%mm3, %%mm2 (the zero point is assumed to be at 128 not 0)
pminub CONST0, mm2   "-9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
pmaxub CONST1, mm2   "-4 -4 -4 -4 -4 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
movq %%mm2, %%mm3    "-4 -4 -4 -4 -4 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
paddw %%mm2, %%mm2   "-8 -8 -8 -8 -8 -8 -6 -4 -2 0 2 4 6 8 8 8 8 8 8"
pminub CONST0, mm2   "-8 -8 -8 -8 -8 -8 -6 -4 -2 0 2 4 4 4 4 4 4 4 4"
pmaxub CONST1, mm2   "-4 -4 -4 -4 -4 -4 -4 -4 -2 0 2 4 4 4 4 4 4 4 4"
psubb %%mm2, %%mm3   "-0 -0 -0 -0 -0 -0 +1 +2 +1 0-1-2-1 0 0 0 0 0 0"

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Let us carefully observe those good qualities wherein our enemies excel us
and endeavor to excel them, by avoiding what is faulty, and imitating what
is excellent in them. -- Plutarch
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20081004/82f254d0/attachment.pgp>