[FFmpeg-devel] [PATCH 1/4] avcodec/x86: add put_pixels16_x2_sse2
Michael Niedermayer
michaelni at gmx.at
Sun Feb 3 16:31:06 CET 2013
This makes P-frame motion compensation about 1% faster for matrixbench on an i7.
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
libavcodec/x86/dsputil_mmx.c | 4 ++++
libavcodec/x86/hpeldsp.asm | 31 ++++++++++++++++++++++++++++++-
2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 2e8300a..29d87a1 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1523,6 +1523,8 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
@@ -2034,6 +2036,8 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
+
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
}
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 7f0c285..81b6901 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -2,7 +2,7 @@
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice at bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k at mail.ru>
-;* Copyright (c) 2002 Michael Niedermayer <michaelni at gmx.at>
+;* Copyright (c) 2002-2013 Michael Niedermayer <michaelni at gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi at informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
@@ -513,3 +513,32 @@ cglobal avg_pixels16, 4,5,4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
+
+; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+cglobal put_pixels16_x2, 4, 5, 4 ; per x86inc convention r0=block, r1=pixels, r2=line_size, r3=h; r4 is scratch
+ movsxdifnidn r2, r2d ; x86inc macro: sign-extend the int line_size so it can be used in addresses
+ lea r4, [r2*2] ; r4 = 2*line_size, the step applied to both pointers after each row pair
+.loop: ; one pass writes four rows: two row pairs, then r3d -= 4
+ movu m0, [r1] ; source row 0 (unaligned load)
+ movu m1, [r1+r2] ; source row 1
+ movu m2, [r1+1] ; source row 0 shifted by one byte (right-hand neighbours)
+ movu m3, [r1+r2+1] ; source row 1 shifted by one byte
+ pavgb m0, m2 ; per-byte average of each pixel with its right neighbour (pavgb rounds up)
+ pavgb m1, m3 ; same for row 1
+ mova [r0], m0 ; aligned store of row 0 -- assumes block meets the vector alignment; TODO confirm with callers
+ mova [r0+r2], m1 ; aligned store of row 1
+ add r1, r4 ; advance source by two rows
+ add r0, r4 ; advance destination by two rows
+ movu m0, [r1] ; second row pair: source row 2
+ movu m1, [r1+r2] ; source row 3
+ movu m2, [r1+1] ; source row 2 shifted by one byte
+ movu m3, [r1+r2+1] ; source row 3 shifted by one byte
+ pavgb m0, m2 ; average row 2 with its right neighbours
+ pavgb m1, m3 ; average row 3 with its right neighbours
+ add r1, r4 ; advance source by two rows (done before the stores; independent of them)
+ mova [r0], m0 ; aligned store of row 2
+ mova [r0+r2], m1 ; aligned store of row 3
+ add r0, r4 ; advance destination by two rows
+ sub r3d, 4 ; four rows consumed; sets ZF when the count reaches zero
+ jne .loop ; repeat until h hits exactly 0 -- NOTE(review): relies on h being a positive multiple of 4
+ REP_RET
--
1.7.9.5
More information about the ffmpeg-devel
mailing list