[FFmpeg-devel] [PATCH 6/6] avcodec/h264: add sse2 versions of previous idct functions

James Darnley jdarnley at obe.tv
Sat Apr 15 04:46:18 EEST 2017


Benchmark results on a Kaby Lake Pentium:
 - ff_h264_idct_add_8_sse2:    ~1.18x faster than mmxext
 - ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext
---
 libavcodec/x86/h264_idct.asm  | 11 +++++++++--
 libavcodec/x86/h264dsp_init.c |  5 +++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 43f7791..5d83d91 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1140,8 +1140,6 @@ IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
 
-INIT_XMM avx
-
 ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
 %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
     movd       %3, [%7]
@@ -1170,6 +1168,10 @@ INIT_XMM avx
     packuswb m1, m1
 %endmacro
 
+%macro IDCT_XMM 1
+
+INIT_XMM %1
+
 cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
     movsxdifnidn stride_q, stride_d
     IDCT4_ADD    dst_q, block_q, stride_q
@@ -1182,3 +1184,8 @@ cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
     DC_ADD_INIT r3
     DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
 RET
+
+%endmacro
+
+IDCT_XMM sse2
+IDCT_XMM avx
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index bf74937..ce7179f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,9 +32,11 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                        int stride);
 
 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, sse2)
 IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, sse2)
 IDCT_ADD_FUNC(_dc, 8, avx)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
 IDCT_ADD_FUNC(8_dc, 8, mmxext)
@@ -316,6 +318,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_sse2;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
             }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_sse2;
+            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
-- 
2.8.3



More information about the ffmpeg-devel mailing list