[FFmpeg-devel] [PATCH 5/5] avcodec/h264: add avx 8-bit h264_idct_dc_add

James Almer jamrial at gmail.com
Wed Apr 5 07:05:08 EEST 2017


On 4/4/2017 10:53 PM, James Darnley wrote:
> Haswell:
>  - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
> 
> Skylake-U:
>  - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
> ---
>  libavcodec/x86/h264_idct.asm  | 20 ++++++++++++++++++++
>  libavcodec/x86/h264dsp_init.c |  2 ++
>  2 files changed, 22 insertions(+)
> 
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
> index 24fb4d2..7fd57d3 100644
> --- a/libavcodec/x86/h264_idct.asm
> +++ b/libavcodec/x86/h264_idct.asm
> @@ -1158,7 +1158,27 @@ INIT_XMM avx
>      movd  [%7+%8], %4
>  %endmacro
>  
> +%macro DC_ADD_INIT 1
> +    add      %1d, 32
> +    sar      %1d, 6
> +    movd     m0, %1d
> +    SPLATW   m0, m0, 0

Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be
enough. SPLATW expands to two instructions in order to fill the entire XMM
register, and there's no need for that here.
You could, for that matter, try to optimize DC_ADD_MMXEXT_OP a bit by combining
said dwords with punpck* into fewer registers, reducing the number of padd*
and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm.

And again, SSE2 first, AVX only if measurably faster. But since you're not
making use of the wider XMM regs here at all, the only chips that will see
any real speedup are those that are slow at MMX (like Skylake seems to be).

> +    lea      %1, [3*stride_q]
> +    pxor     m1, m1
> +    psubw    m1, m0
> +    packuswb m0, m0
> +    packuswb m1, m1
> +%endmacro
> +
>  cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
>      movsxdifnidn stride_q, stride_d
>      IDCT4_ADD    dst_q, block_q, stride_q
>  RET
> +
> +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_
> +    movsxdifnidn stride_q, stride_d
> +    movsx             r3d, word [block_q]
> +    mov   dword [block_q], 0
> +    DC_ADD_INIT r3
> +    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
> +RET
> diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
> index 8ba085f..bf74937 100644
> --- a/libavcodec/x86/h264dsp_init.c
> +++ b/libavcodec/x86/h264dsp_init.c
> @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
>  IDCT_ADD_FUNC(, 8, avx)
>  IDCT_ADD_FUNC(, 10, sse2)
>  IDCT_ADD_FUNC(_dc, 8, mmxext)
> +IDCT_ADD_FUNC(_dc, 8, avx)
>  IDCT_ADD_FUNC(_dc, 10, mmxext)
>  IDCT_ADD_FUNC(8_dc, 8, mmxext)
>  IDCT_ADD_FUNC(8_dc, 10, sse2)
> @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
>              }
>  
>              c->h264_idct_add        = ff_h264_idct_add_8_avx;
> +            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_avx;
>          }
>      } else if (bit_depth == 10) {
>          if (EXTERNAL_MMXEXT(cpu_flags)) {
> 



More information about the ffmpeg-devel mailing list