[FFmpeg-cvslog] h264pred: added AVX2 implementation for tm_vp8 16x16.
Mirage Abeysekara
git at videolan.org
Mon Mar 20 15:53:25 EET 2017
ffmpeg | branch: master | Mirage Abeysekara <mirage.12 at cse.mrt.ac.lk> | Sun Mar 19 01:38:53 2017 +0530| [5eb4f95bef2f95c4e776c60613b2825064d02ba9] | committer: Ronald S. Bultje
h264pred: added AVX2 implementation for tm_vp8 16x16.
checkasm --bench results with 5000 runs
pred16x16_tm_vp8_c: 302.8
pred16x16_tm_vp8_mmx: 101.4
pred16x16_tm_vp8_mmxext: 95.5
pred16x16_tm_vp8_sse2: 95.1
pred16x16_tm_vp8_avx2: 38.2
Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5eb4f95bef2f95c4e776c60613b2825064d02ba9
---
libavcodec/x86/h264_intrapred.asm | 37 ++++++++++++++++++++++++++++++++++++
libavcodec/x86/h264_intrapred_init.c | 7 +++++++
2 files changed, 44 insertions(+)
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index c88d91b..0f3b462 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 528b92e..bdd5125 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
+PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
+
+ if(EXTERNAL_AVX2(cpu_flags)){
+ if (codec_id == AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
+ }
+ }
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
More information about the ffmpeg-cvslog
mailing list