[FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()

Wu, Jianhua jianhua.wu at intel.com
Mon Sep 6 05:10:54 EEST 2021


Ping.

> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu at intel.com>
> Sent: Friday, August 27, 2021 12:52 PM
> To: ffmpeg-devel at ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu at intel.com>
> Subject: [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()
> 
> Performance (less is better):
> 8bit:
>     ff_hflip_byte_ssse3   0.61
>     ff_hflip_byte_avx2    0.37
>     ff_hflip_byte_avx512  0.19
> 16bit:
>     ff_hflip_short_ssse3  1.27
>     ff_hflip_short_avx2   0.76
>     ff_hflip_short_avx512 0.40
> 
> Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
> ---
>  libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
>  libavfilter/x86/vf_hflip_init.c |  8 ++++++++
>  2 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
> index 285618954f..c2237217f7 100644
> --- a/libavfilter/x86/vf_hflip.asm
> +++ b/libavfilter/x86/vf_hflip.asm
> @@ -26,12 +26,16 @@ SECTION_RODATA
> 
>  pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
>  pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
> 
>  SECTION .text
> 
>  ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
>  %macro HFLIP 3
>  cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
> +%if mmsize == 64
> +    movu              m3, [pd_flip_indicies]
> +%endif
>      VBROADCASTI128    m0, [pb_flip_%1]
>      xor               xq, xq
>  %if %3 == 1
> @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
> 
>      .loop0:
>          neg     xq
> -%if mmsize == 32
> -        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
> -        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
> +%if   mmsize == 64
> +        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
> +        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
> +%elif mmsize == 32
> +        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
> +        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
>  %else
> -        movu    m1, [srcq + xq -     mmsize + %3]
> -        movu    m2, [srcq + xq - 2 * mmsize + %3]
> +        movu        m1, [srcq + xq -     mmsize + %3]
> +        movu        m2, [srcq + xq - 2 * mmsize + %3]
>  %endif
>          pshufb  m1, m0
>          pshufb  m2, m0
> @@ -88,3 +95,9 @@ INIT_YMM avx2
>  HFLIP byte, b, 1
>  HFLIP short, w, 2
>  %endif
> +
> +%if HAVE_AVX512_EXTERNAL
> +INIT_ZMM avx512
> +HFLIP byte, b, 1
> +HFLIP short, w, 2
> +%endif
> diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
> index 0ac399b0d4..25fc40f7b0 100644
> --- a/libavfilter/x86/vf_hflip_init.c
> +++ b/libavfilter/x86/vf_hflip_init.c
> @@ -25,8 +25,10 @@
> 
>  void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
>  void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
> 
>  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>  {
> @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>              if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_byte_avx2;
>              }
> +            if (EXTERNAL_AVX512(cpu_flags)) {
> +                s->flip_line[i] = ff_hflip_byte_avx512;
> +            }
>          } else if (step[i] == 2) {
>              if (EXTERNAL_SSSE3(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_short_ssse3;
> @@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
>              if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>                  s->flip_line[i] = ff_hflip_short_avx2;
>              }
> +            if (EXTERNAL_AVX512(cpu_flags)) {
> +                s->flip_line[i] = ff_hflip_short_avx512;
> +            }
>          }
>      }
>  }
> --
> 2.17.1



More information about the ffmpeg-devel mailing list