[FFmpeg-devel] [PATCH 4/4] avcodec/x86: avg_pixels16_x2_sse2

Sun Feb 3 16:31:09 CET 2013

about 1% faster bidirectional motion compensation for matrixbench
on i7

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
 libavcodec/x86/dsputil_mmx.c |    3 +++
 libavcodec/x86/hpeldsp.asm   |   37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1c796ae..9587cd4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1529,6 +1529,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
 void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);
 
@@ -2045,6 +2047,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
             c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
             c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+            c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
             c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
         }
     }
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index a151834..45e0812 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -546,3 +546,40 @@ cglobal put_pixels16_x2, 4, 5, 4
     sub         r3d, 4
     jne .loop
     REP_RET
+
+; avg_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+cglobal avg_pixels16_x2, 4, 5, 4         ; per x86inc convention: r0=block, r1=pixels, r2=line_size, r3=h; r4 is scratch
+    movsxdifnidn r2, r2d                 ; x86inc helper; presumably widens the int line_size to pointer width -- confirm
+    lea          r4, [r2*2]              ; r4 = 2 * line_size, the step for one pair of rows
+.loop:                                   ; each pass handles 4 rows, as two pairs of 2
+    movu         m0, [r1]                ; pixels, row 0
+    movu         m2, [r1+r2]             ; pixels, row 1
+    movu         m1, [r1+1]              ; row 0, one byte to the right
+    movu         m3, [r1+r2+1]           ; row 1, one byte to the right
+    PAVGB        m0, m1                  ; PAVGB is defined outside this hunk; presumably a per-byte average -- confirm
+    PAVGB        m2, m3
+    movu         m1, [r0]                ; current contents of block, row 0
+    movu         m3, [r0+r2]             ; current contents of block, row 1
+    PAVGB        m0, m1                  ; combine the horizontal result with what block already holds
+    PAVGB        m2, m3
+    add          r1, r4                  ; pixels += 2 rows (done before the stores; r0 is still on rows 0/1)
+    mova       [r0], m0                  ; NOTE(review): stored with mova but loaded with movu above; presumably block is aligned -- confirm
+    mova    [r0+r2], m2
+    movu         m0, [r1]                ; second pair: pixels, row 2
+    movu         m2, [r1+r2]             ; pixels, row 3
+    movu         m1, [r1+1]              ; row 2, one byte to the right
+    movu         m3, [r1+r2+1]           ; row 3, one byte to the right
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    add          r0, r4                  ; block += 2 rows, now at rows 2/3
+    add          r1, r4                  ; pixels += 2 rows, ready for the next pass
+    movu         m1, [r0]                ; current contents of block, row 2
+    movu         m3, [r0+r2]             ; current contents of block, row 3
+    PAVGB        m0, m1
+    PAVGB        m2, m3
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    add          r0, r4                  ; block += 2 rows, ready for the next pass
+    sub         r3d, 4                   ; 4 rows consumed
+    jne .loop                            ; exits only when the counter is exactly 0, so h must be a positive multiple of 4
+    REP_RET                              ; return macro defined outside this hunk (x86inc)
-- 
1.7.9.5