[FFmpeg-devel] [PATCH] MMX VP3 Loop Filter
Michael Niedermayer
michaelni
Sat Oct 4 20:18:52 CEST 2008
On Sat, Oct 04, 2008 at 12:21:42AM -0400, David Conrad wrote:
> Hi,
>
> This was adapted from libtheora and gives an overall speedup of about 5%.
>
> 685 dezicycles in ff_vp3_v_loop_filter_mmx, 8388422 runs, 186 skips
> 1334 dezicycles in ff_vp3_h_loop_filter_mmx, 8388231 runs, 377 skips
> 680 dezicycles in ff_vp3_v_loop_filter_mmx, 16776868 runs, 348 skips
> 1327 dezicycles in ff_vp3_h_loop_filter_mmx, 16776505 runs, 711 skips
> 688 dezicycles in ff_vp3_v_loop_filter_mmx, 33553784 runs, 648 skips
> 1350 dezicycles in ff_vp3_h_loop_filter_mmx, 33552962 runs, 1470 skips
>
> 1553 dezicycles in ff_vp3_v_loop_filter_c, 8388263 runs, 345 skips
> 1722 dezicycles in ff_vp3_h_loop_filter_c, 8388149 runs, 459 skips
> 1551 dezicycles in ff_vp3_v_loop_filter_c, 16776600 runs, 616 skips
> 1710 dezicycles in ff_vp3_h_loop_filter_c, 16776297 runs, 919 skips
> 1556 dezicycles in ff_vp3_v_loop_filter_c, 33553133 runs, 1299 skips
> 1737 dezicycles in ff_vp3_h_loop_filter_c, 33552271 runs, 2161 skips
>
> commit 826c801ddf0e1844b6c8970c63068a6af8181fdd
> Author: David Conrad <davedc at Kozue.local>
> Date: Wed Oct 1 18:18:53 2008 -0400
>
> MMX VP3 loop filter, adapted from libtheora
>
> diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
> index 6e1a93d..2f035d2 100644
> --- a/libavcodec/i386/dsputil_mmx.c
> +++ b/libavcodec/i386/dsputil_mmx.c
> @@ -2591,6 +2591,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
> c->h263_v_loop_filter= h263_v_loop_filter_mmx;
> c->h263_h_loop_filter= h263_h_loop_filter_mmx;
> }
> + if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
> + c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx;
> + c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx;
> + }
> c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
> c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
> c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
> diff --git a/libavcodec/i386/dsputil_mmx.h b/libavcodec/i386/dsputil_mmx.h
> index f095975..a4a9eab 100644
> --- a/libavcodec/i386/dsputil_mmx.h
> +++ b/libavcodec/i386/dsputil_mmx.h
> @@ -86,6 +86,20 @@ extern const double ff_pd_2[2];
> SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
> SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
>
> +#define TRANSPOSE8x4(a,b,c,d,e,f,g,h)\
> + "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
> + "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
> + "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
> + "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
> + SBUTTERFLY(a, b, e, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
> + /* e= a2 b2 e2 f2 a3 b3 e3 f3 */\
> + SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
> + /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
> + SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
> + /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
> + SBUTTERFLY(e, b, c, wd, q) /* e= a2 b2 c2 d2 e2 f2 g2 h2 */\
> + /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
> +
> #ifdef ARCH_X86_64
> // permutes 01234567 -> 05736421
> #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
> diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/i386/vp3dsp_mmx.c
> index 6304e91..83121d9 100644
> --- a/libavcodec/i386/vp3dsp_mmx.c
> +++ b/libavcodec/i386/vp3dsp_mmx.c
> @@ -21,13 +21,192 @@
> /**
> * @file vp3dsp_mmx.c
> * MMX-optimized functions cribbed from the original VP3 source code.
> + * MMX loop filter from libtheora
> */
>
> +#include "libavutil/x86_cpu.h"
> #include "libavcodec/dsputil.h"
> #include "dsputil_mmx.h"
>
> extern const uint16_t ff_vp3_idct_data[];
>
> +// in: p0 in mm7, p1 in mm4, p2 in mm2, p3 in mm1
> +// out: p1 in mm4, p2 in mm1
> +#define VP3_LOOP_FILTER(flim, pw3, pw4) \
> + "pxor %%mm0, %%mm0 \n\t" \
> + "movq %%mm7, %%mm6 \n\t" \
> + "punpcklbw %%mm0, %%mm6 \n\t" \
Operating first on the register that was not written to by the preceding mov is likely faster.
> + "movq %%mm1, %%mm5 \n\t" \
> + "punpckhbw %%mm0, %%mm7 \n\t" \
> + "punpcklbw %%mm0, %%mm1 \n\t" \
> + "punpckhbw %%mm0, %%mm5 \n\t" \
> + "psubw %%mm1, %%mm6 \n\t" \
> + "psubw %%mm5, %%mm7 \n\t" /* mm7:mm6 = p0-p3 */ \
> + "movq %%mm4, %%mm5 \n\t" \
> + "movq %%mm2, %%mm3 \n\t" \
> + "movq %%mm2, %%mm1 \n\t" \
> + "punpckhbw %%mm0, %%mm5 \n\t" \
> + "punpcklbw %%mm0, %%mm4 \n\t" \
> + "punpckhbw %%mm0, %%mm3 \n\t" \
> + "punpcklbw %%mm0, %%mm2 \n\t" \
> + "psubw %%mm5, %%mm3 \n\t" \
> + "psubw %%mm4, %%mm2 \n\t" /* mm3:mm2 = p2-p1 */ \
> + "pmullw "#pw3", %%mm3 \n\t" \
> + "pmullw "#pw3", %%mm2 \n\t" \
> + "paddw %%mm7, %%mm3 \n\t" \
> + "paddw %%mm6, %%mm2 \n\t" /* mm3:mm2 = p0-p3 + 3*(p2-p1) */ \
> + "paddw "#pw4", %%mm3 \n\t" \
> + "paddw "#pw4", %%mm2 \n\t" \
> + "psraw $3, %%mm3 \n\t" \
> + "psraw $3, %%mm2 \n\t" \
> + "packuswb %%mm5, %%mm4 \n\t" /* mm4 = (p0-p3 + 3*(p2-p1) + 4) >> 3 */ \
?
the comment has nothing to do with the code
> +\
> + "movd "#flim", %%mm5 \n\t" \
> + "punpcklbw %%mm5, %%mm5 \n\t" \
> + "punpcklbw %%mm5, %%mm5 \n\t" \
> + "punpcklbw %%mm0, %%mm5 \n\t" /*mm5 = L L L L */ \
You can precalculate that, I think.
> +/*if(R_i<-2L||R_i>2L)R_i=0:*/ \
> + "movq %%mm2, %%mm0 \n\t" \
> + "pxor %%mm6, %%mm6 \n\t" \
mm0 was 0; don't overwrite it and then zero another register ...
> + "movq %%mm5, %%mm7 \n\t" \
> + "psubw %%mm5, %%mm6 \n\t" \
> + "psllw $1, %%mm7 \n\t" \
paddw
> + "psllw $1, %%mm6 \n\t" \
> +/*mm2==R_3 R_2 R_1 R_0*/ \
> +/*mm0==R_3 R_2 R_1 R_0*/ \
> +/*mm6==-2L -2L -2L -2L*/ \
> +/*mm7==2L 2L 2L 2L*/ \
> + "pcmpgtw %%mm2, %%mm7 \n\t" \
> + "pcmpgtw %%mm6, %%mm0 \n\t" \
> + "pand %%mm7, %%mm2 \n\t" \
> + "movq %%mm5, %%mm7 \n\t" \
> + "pand %%mm0, %%mm2 \n\t" \
> + "psllw $1, %%mm7 \n\t" \
> + "movq %%mm3, %%mm0 \n\t" \
> +/*mm3==R_7 R_6 R_5 R_4*/ \
> +/*mm0==R_7 R_6 R_5 R_4*/ \
> +/*mm6==-2L -2L -2L -2L*/ \
> +/*mm7==2L 2L 2L 2L*/ \
> + "pcmpgtw %%mm3, %%mm7 \n\t" \
> + "pcmpgtw %%mm6, %%mm0 \n\t" \
> + "pand %%mm7, %%mm3 \n\t" \
> + "movq %%mm5, %%mm7 \n\t" \
> + "pand %%mm0, %%mm3 \n\t" \
> +/*if(R_i<-L)R_i'=R_i+2L;*/ \
> +/*if(R_i>L)R_i'=R_i-2L;*/ \
> +/*if(R_i<-L||R_i>L)R_i=-R_i':*/ \
> + "psraw $1, %%mm6 \n\t" \
> + "movq %%mm2, %%mm0 \n\t" \
> + "psllw $1, %%mm7 \n\t" \
> +/*mm2==R_3 R_2 R_1 R_0*/ \
> +/*mm0==R_3 R_2 R_1 R_0*/ \
> +/*mm6==-L -L -L -L*/ \
> +/*mm5==L L L L*/ \
> + "pcmpgtw %%mm5, %%mm0 \n\t" /*mm0=R_i>L?FF:00*/ \
> + "pcmpgtw %%mm2, %%mm6 \n\t" /*mm6=-L>R_i?FF:00*/ \
> + "pand %%mm0, %%mm7 \n\t" /*mm7=R_i>L?2L:0*/ \
> + "psubw %%mm7, %%mm2 \n\t" /*mm2=R_i>L?R_i-2L:R_i*/ \
> + "movq %%mm5, %%mm7 \n\t" \
> + "por %%mm6, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L*/ \
> + "psllw $1, %%mm7 \n\t" \
> + "pand %%mm6, %%mm7 \n\t" /*mm7=-L>R_i?2L:0*/ \
> + "pxor %%mm6, %%mm6 \n\t" \
> + "paddw %%mm7, %%mm2 \n\t" /*mm2=-L>R_i?R_i+2L:R_i*/ \
> + "psubw %%mm5, %%mm6 \n\t" \
> + "pand %%mm2, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L?-R_i':0*/ \
> + "movq %%mm5, %%mm7 \n\t" \
> + "psubw %%mm0, %%mm2 \n\t" /*mm2=-L>R_i||R_i>L?0:R_i*/ \
> + "psllw $1, %%mm7 \n\t" \
> + "psubw %%mm0, %%mm2 \n\t" /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ \
> + "movq %%mm3, %%mm0 \n\t" \
> +/*mm3==R_7 R_6 R_5 R_4*/ \
> +/*mm0==R_7 R_6 R_5 R_4*/ \
> +/*mm6==-L -L -L -L*/ \
> +/*mm5==L L L L*/ \
> + "pcmpgtw %%mm3, %%mm6 \n\t" /*mm6=-L>R_i?FF:00*/ \
> + "pcmpgtw %%mm5, %%mm0 \n\t" /*mm0=R_i>L?FF:00*/ \
> + "pand %%mm0, %%mm7 \n\t" /*mm7=R_i>L?2L:0*/ \
> + "psubw %%mm7, %%mm3 \n\t" /*mm3=R_i>L?R_i-2L:R_i*/ \
> + "psllw $1, %%mm5 \n\t" \
> + "por %%mm6, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L*/ \
> + "pand %%mm6, %%mm5 \n\t" /*mm5=-L>R_i?2L:0*/ \
> + "paddw %%mm5, %%mm3 \n\t" /*mm3=-L>R_i?R_i+2L:R_i*/ \
> + "pand %%mm3, %%mm0 \n\t" /*mm0=-L>R_i||R_i>L?-R_i':0*/ \
> + "psubw %%mm0, %%mm3 \n\t" /*mm3=-L>R_i||R_i>L?0:R_i*/ \
> + "psubw %%mm0, %%mm3 \n\t" /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ \
> +/* We need u8+s8 with unsigned saturation, so promote to 16 bits */ \
> + "pxor %%mm5, %%mm5 \n\t" \
> + "movq %%mm4, %%mm0 \n\t" \
> + "punpcklbw %%mm5, %%mm4 \n\t" \
> + "punpckhbw %%mm5, %%mm0 \n\t" \
> + "movq %%mm1, %%mm6 \n\t" \
> + "punpcklbw %%mm5, %%mm1 \n\t" \
> + "punpckhbw %%mm5, %%mm6 \n\t" \
Aren't these unpacks duplicated?
Anyway, I think the code is of pretty poor quality; I suspect the same can be
done with half as many instructions or fewer,
something like:
packuswb %%mm3, %%mm2 (the zero point is assumed to be at 128 not 0)
pminub CONST0, mm2 "-9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
pmaxub CONST1, mm2 "-4 -4 -4 -4 -4 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
movq %%mm2, %%mm3 "-4 -4 -4 -4 -4 -4 -3 -2 -1 0 1 2 3 4 4 4 4 4 4"
paddw %%mm2, %%mm2 "-8 -8 -8 -8 -8 -8 -6 -4 -2 0 2 4 6 8 8 8 8 8 8"
pminub CONST0, mm2 "-8 -8 -8 -8 -8 -8 -6 -4 -2 0 2 4 4 4 4 4 4 4 4"
pmaxub CONST1, mm2 "-4 -4 -4 -4 -4 -4 -4 -4 -2 0 2 4 4 4 4 4 4 4 4"
psubb %%mm2, %%mm3 "-0 -0 -0 -0 -0 -0 +1 +2 +1 0-1-2-1 0 0 0 0 0 0"
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Let us carefully observe those good qualities wherein our enemies excel us
and endeavor to excel them, by avoiding what is faulty, and imitating what
is excellent in them. -- Plutarch
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20081004/82f254d0/attachment.pgp>
More information about the ffmpeg-devel
mailing list