[FFmpeg-cvslog] avcodec/h264: mmx2, sse2, avx 10-bit 4:2:2 h chroma deblock/loop filter
James Darnley
git at videolan.org
Wed Dec 7 01:33:37 EET 2016
ffmpeg | branch: master | James Darnley <jdarnley at obe.tv> | Thu Dec 1 03:15:10 2016 +0100| [728651df06f48b49228ac5b8241b1ebe8d5b9d73] | committer: James Darnley
avcodec/h264: mmx2, sse2, avx 10-bit 4:2:2 h chroma deblock/loop filter
Yorkfield:
- mmx2: 2.53x (504 vs. 199 cycles)
- sse2: 3.83x (504 vs. 131 cycles)
Nehalem:
- mmx2: 2.42x (365 vs. 151 cycles)
- sse2: 3.56x (365 vs. 103 cycles)
Skylake:
- mmx2: 1.81x (308 vs. 170 cycles)
- sse2: 2.84x (308 vs. 108 cycles)
- avx: 2.93x (308 vs. 105 cycles)
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=728651df06f48b49228ac5b8241b1ebe8d5b9d73
---
libavcodec/x86/h264_deblock_10bit.asm | 39 +++++++++++++++++++++++++++++++++++
libavcodec/x86/h264dsp_init.c | 6 ++++++
2 files changed, 45 insertions(+)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 3536e41..56cf4d6 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -1032,6 +1032,45 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc
%endif
RET
+;-----------------------------------------------------------------------------
+; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
+; int8_t *tc0)
+;-----------------------------------------------------------------------------
+cglobal deblock_h_chroma422_10, 5, 7, 8, 3*mmsize, pix_, stride_, alpha_, beta_, tc0_ ; x86inc: 5 args, 7 GPRs, 8 vector regs, 3*mmsize bytes of stack scratch
+ shl alpha_d, 2 ; alpha *= 4 (presumably the 8-bit -> 10-bit threshold scale - TODO confirm)
+ shl beta_d, 2 ; beta *= 4, same scaling as alpha
+
+ movd m0, [tc0_q] ; load 4 bytes of tc0 (int8_t per the prototype comment)
+ punpcklbw m0, m0 ; copy each byte into both halves of a 16-bit word
+ psraw m0, 6 ; arithmetic shift: word = tc0*4 for 0 <= tc0 < 64; sign is kept (-1 stays -1)
+ movq [rsp], m0 ; spill the four tc words to [rsp]; the loop reads them back from there
+
+ mov r5, pix_q ; r5 = pix
+ lea r6, [3*stride_q] ; r6 = 3 * stride
+ add r5, r6 ; r5 = pix + 3 * stride
+
+ mov r4, -8 ; r4 = negative loop counter, counts up towards 0
+ .loop: ; each pass advances the pixel pointers by mmsize/2 rows
+
+ CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] ; macro defined elsewhere; presumably loads the pixels either side of the edge, with the two stack slots as scratch - TODO confirm
+ LOAD_AB m4, m5, alpha_d, beta_d ; macro defined elsewhere; presumably broadcasts alpha/beta into m4/m5 - TODO confirm
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 ; macro defined elsewhere; presumably leaves the filter mask in m7 - TODO confirm
+ pxor m4, m4 ; m4 = 0
+ movd m6, [rsp + r4 + 8] ; load 4 bytes of the saved tc words; offset r4 + 8 is 0 on the first pass
+ punpcklwd m6, m6 ; first step of the tc broadcast: each low word doubled
+ punpcklwd m6, m6 ; second step: each low source word now fills 4 consecutive word lanes
+ psubw m6, [pw_3] ; tc = tc0*4 - 3, assuming pw_3 (defined elsewhere) is packed words of 3
+ pmaxsw m6, m4 ; clamp negative tc to 0
+ pand m7, m6 ; m7 &= tc
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 ; macro defined elsewhere; presumably filters p0/q0 held in m1/m2 - TODO confirm
+ CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] ; macro defined elsewhere; presumably writes the filtered pixels back - TODO confirm
+
+ lea pix_q, [pix_q + (mmsize/2)*stride_q] ; pix += (mmsize/2) * stride
+ lea r5, [r5 + (mmsize/2)*stride_q] ; r5 += (mmsize/2) * stride
+ add r4, (mmsize/4) ; step mmsize/4 bytes through the saved tc words
+ jl .loop ; repeat while r4 is still negative (flags come from the add)
+RET
+
%endmacro
%if ARCH_X86_64 == 0
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index ab270da..7b3d17f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -315,6 +315,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
}
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
@@ -351,6 +353,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
@@ -389,6 +393,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
}
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
More information about the ffmpeg-cvslog
mailing list