[FFmpeg-devel] [PATCH 4/4] avcodec/x86: avg_pixels16_x2_sse2
Michael Niedermayer
michaelni at gmx.at
Sun Feb 3 16:31:09 CET 2013
about 1% faster bidirectional motion compensation for matrixbench
on i7
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
---
libavcodec/x86/dsputil_mmx.c | 3 +++
libavcodec/x86/hpeldsp.asm | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1c796ae..9587cd4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1529,6 +1529,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
@@ -2045,6 +2047,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
}
}
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index a151834..45e0812 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -546,3 +546,40 @@ cglobal put_pixels16_x2, 4, 5, 4
sub r3d, 4
jne .loop
REP_RET
+
+; avg_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+cglobal avg_pixels16_x2, 4, 5, 4 ; x86inc: 4 args (r0=block, r1=pixels, r2=line_size, r3=h), 5 GPRs, 4 vector regs
+ movsxdifnidn r2, r2d ; widen the int line_size so it can be used in address arithmetic
+ lea r4, [r2*2] ; r4 = 2 * line_size; both pointers advance two rows at a time
+.loop: ; one iteration processes 4 rows, as two pairs of rows
+ movu m0, [r1] ; source row 0
+ movu m2, [r1+r2] ; source row 1
+ movu m1, [r1+1] ; source row 0, one byte to the right
+ movu m3, [r1+r2+1] ; source row 1, one byte to the right
+ PAVGB m0, m1 ; combine row 0 with its right neighbours (PAVGB is defined outside this hunk; presumably a rounding byte average - confirm)
+ PAVGB m2, m3 ; same for row 1
+ movu m1, [r0] ; current contents of block row 0
+ movu m3, [r0+r2] ; current contents of block row 1
+ PAVGB m0, m1 ; combine the interpolated row 0 with what is already in block
+ PAVGB m2, m3 ; same for row 1
+ add r1, r4 ; pixels -> rows 2/3 (done before the stores; the stores do not use r1)
+ mova [r0], m0 ; write back block row 0; mova is the aligned move, so block rows are assumed suitably aligned - confirm with callers
+ mova [r0+r2], m2 ; write back block row 1
+ movu m0, [r1] ; source row 2
+ movu m2, [r1+r2] ; source row 3
+ movu m1, [r1+1] ; source row 2, one byte to the right
+ movu m3, [r1+r2+1] ; source row 3, one byte to the right
+ PAVGB m0, m1 ; combine row 2 with its right neighbours
+ PAVGB m2, m3 ; same for row 3
+ add r0, r4 ; block -> rows 2/3
+ add r1, r4 ; pixels -> first row of the next iteration
+ movu m1, [r0] ; current contents of block row 2
+ movu m3, [r0+r2] ; current contents of block row 3
+ PAVGB m0, m1 ; combine the interpolated row 2 with what is already in block
+ PAVGB m2, m3 ; same for row 3
+ mova [r0], m0 ; write back block row 2
+ mova [r0+r2], m2 ; write back block row 3
+ add r0, r4 ; block -> first row of the next iteration
+ sub r3d, 4 ; four rows consumed
+ jne .loop ; exits only when h reaches exactly 0, so h must be a positive multiple of 4
+ REP_RET ; return macro defined outside this hunk (x86inc)
--
1.7.9.5
More information about the ffmpeg-devel
mailing list