[FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()
Wu Jianhua
jianhua.wu at intel.com
Fri Aug 27 07:51:41 EEST 2021
Performance (lower is better):
8bit:
ff_hflip_byte_ssse3 0.61
ff_hflip_byte_avx2 0.37
ff_hflip_byte_avx512 0.19
16bit:
ff_hflip_short_ssse3 1.27
ff_hflip_short_avx2 0.76
ff_hflip_short_avx512 0.40
Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
libavfilter/x86/vf_hflip.asm | 23 ++++++++++++++++++-----
libavfilter/x86/vf_hflip_init.c | 8 ++++++++
2 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 285618954f..c2237217f7 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -26,12 +26,16 @@ SECTION_RODATA
pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
SECTION .text
;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
%macro HFLIP 3
cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+%if mmsize == 64
+ movu m3, [pd_flip_indicies]
+%endif
VBROADCASTI128 m0, [pb_flip_%1]
xor xq, xq
%if %3 == 1
@@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
.loop0:
neg xq
-%if mmsize == 32
- vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
- vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%if mmsize == 64
+ vpermd m1, m3, [srcq + xq - mmsize + %3]
+ vpermd m2, m3, [srcq + xq - 2 * mmsize + %3]
+%elif mmsize == 32
+ vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
+ vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
%else
- movu m1, [srcq + xq - mmsize + %3]
- movu m2, [srcq + xq - 2 * mmsize + %3]
+ movu m1, [srcq + xq - mmsize + %3]
+ movu m2, [srcq + xq - 2 * mmsize + %3]
%endif
pshufb m1, m0
pshufb m2, m0
@@ -88,3 +95,9 @@ INIT_YMM avx2
HFLIP byte, b, 1
HFLIP short, w, 2
%endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 0ac399b0d4..25fc40f7b0 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -25,8 +25,10 @@
void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
{
@@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->flip_line[i] = ff_hflip_byte_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_byte_avx512;
+ }
} else if (step[i] == 2) {
if (EXTERNAL_SSSE3(cpu_flags)) {
s->flip_line[i] = ff_hflip_short_ssse3;
@@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->flip_line[i] = ff_hflip_short_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_short_avx512;
+ }
}
}
}
--
2.17.1
More information about the ffmpeg-devel mailing list