[FFmpeg-devel] [PATCH 1/4] avcodec/x86: add put_pixels16_x2_sse2
Ronald S. Bultje
rsbultje at gmail.com
Sun Feb 3 20:30:56 CET 2013
Hi,
On Sun, Feb 3, 2013 at 7:31 AM, Michael Niedermayer <michaelni at gmx.at> wrote:
> about 1% faster P frame motion compensation for matrixbench on i7
>
> Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> ---
> libavcodec/x86/dsputil_mmx.c | 4 ++++
> libavcodec/x86/hpeldsp.asm | 31 ++++++++++++++++++++++++++++++-
> 2 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 2e8300a..29d87a1 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -1523,6 +1523,8 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
>
> void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
> int line_size, int h);
> +void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
> + int line_size, int h);
> void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
> int line_size, int h);
>
> @@ -2034,6 +2036,8 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
> // these functions are slower than mmx on AMD, but faster on Intel
> if (!high_bit_depth) {
> c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
> + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
> +
> c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
> c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
> }
> diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
> index 7f0c285..81b6901 100644
> --- a/libavcodec/x86/hpeldsp.asm
> +++ b/libavcodec/x86/hpeldsp.asm
> @@ -2,7 +2,7 @@
> ;*
> ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice at bellard.org>
> ;* Copyright (c) Nick Kurshev <nickols_k at mail.ru>
> -;* Copyright (c) 2002 Michael Niedermayer <michaelni at gmx.at>
> +;* Copyright (c) 2002-2013 Michael Niedermayer <michaelni at gmx.at>
> ;* Copyright (c) 2002 Zdenek Kabelac <kabi at informatics.muni.cz>
> ;* Copyright (c) 2013 Daniel Kang
> ;*
> @@ -513,3 +513,32 @@ cglobal avg_pixels16, 4,5,4
> lea r0, [r0+r2*4]
> jnz .loop
> REP_RET
> +
> +; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
> +cglobal put_pixels16_x2, 4, 5, 4
> + movsxdifnidn r2, r2d
> + lea r4, [r2*2]
> +.loop:
> + movu m0, [r1]
> + movu m1, [r1+r2]
> + movu m2, [r1+1]
> + movu m3, [r1+r2+1]
> + pavgb m0, m2
> + pavgb m1, m3
> + mova [r0], m0
> + mova [r0+r2], m1
> + add r1, r4
> + add r0, r4
> + movu m0, [r1]
> + movu m1, [r1+r2]
> + movu m2, [r1+1]
> + movu m3, [r1+r2+1]
> + pavgb m0, m2
> + pavgb m1, m3
> + add r1, r4
> + mova [r0], m0
> + mova [r0+r2], m1
> + add r0, r4
> + sub r3d, 4
> + jne .loop
> + REP_RET
I bet that this code is identical to the 8-pixel mmx/mmx2 version. How
about extending that version to use %+ mmsize in the cglobal line, and
then instantiating it twice (INIT_MMX mmx followed by the macro call,
then INIT_XMM sse2 followed by the macro call), so that the same macro
is used for both versions? That would give smaller and more
maintainable source code.
Ronald
More information about the ffmpeg-devel
mailing list