[FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()

Wu Jianhua jianhua.wu at intel.com
Fri Aug 27 07:51:41 EEST 2021


Performance (lower is better):
8bit:
    ff_hflip_byte_ssse3   0.61
    ff_hflip_byte_avx2    0.37
    ff_hflip_byte_avx512  0.19
16bit:
    ff_hflip_short_ssse3  1.27
    ff_hflip_short_avx2   0.76
    ff_hflip_short_avx512 0.40

Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
 libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
 libavfilter/x86/vf_hflip_init.c |  8 ++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 285618954f..c2237217f7 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -26,12 +26,16 @@ SECTION_RODATA
 
 pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
 
 SECTION .text
 
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+%if mmsize == 64
+    movu              m3, [pd_flip_indicies]
+%endif
     VBROADCASTI128    m0, [pb_flip_%1]
     xor               xq, xq
 %if %3 == 1
@@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
-%if mmsize == 32
-        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
-        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%if   mmsize == 64
+        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
+        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
+%elif mmsize == 32
+        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
+        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
 %else
-        movu    m1, [srcq + xq -     mmsize + %3]
-        movu    m2, [srcq + xq - 2 * mmsize + %3]
+        movu        m1, [srcq + xq -     mmsize + %3]
+        movu        m2, [srcq + xq - 2 * mmsize + %3]
 %endif
         pshufb  m1, m0
         pshufb  m2, m0
@@ -88,3 +95,9 @@ INIT_YMM avx2
 HFLIP byte, b, 1
 HFLIP short, w, 2
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 0ac399b0d4..25fc40f7b0 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -25,8 +25,10 @@
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_byte_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx512;
+            }
         } else if (step[i] == 2) {
             if (EXTERNAL_SSSE3(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_ssse3;
@@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx512;
+            }
         }
     }
 }
-- 
2.17.1



More information about the ffmpeg-devel mailing list