[FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions
James Almer
jamrial at gmail.com
Thu May 23 15:27:14 EEST 2024
Also remove sad_8x8_avx2, as it is not faster than sad_8x8_sse4.
sad_8x8_c: 54.8
sad_8x8_sse4: 14.3
sad_16x16_c: 200.8
sad_16x16_sse4: 34.8
sad_16x16_avx2: 29.8
sad_32x32_c: 826.3
sad_32x32_sse4: 113.8
sad_32x32_avx2: 69.3
sad_64x64_c: 3679.8
sad_64x64_sse4: 392.8
sad_64x64_avx2: 257.3
sad_128x128_c: 12581.3
sad_128x128_sse4: 1560.8
sad_128x128_avx2: 1151.8
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/vvc/vvc_sad.asm | 53 +++++++++++++++++++++-----------
libavcodec/x86/vvc/vvcdsp_init.c | 42 +++++++++++++++++--------
2 files changed, 65 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 829dbce489..26df25ec66 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -26,7 +26,7 @@
SECTION_RODATA
-pw_1: times 2 dw 1
+cextern pw_1
; DMVR SAD is only calculated on even rows to reduce complexity
SECTION .text
@@ -38,20 +38,21 @@ SECTION .text
%endmacro
%macro HORIZ_ADD 3 ; xm0, xm1, m1
+%if mmsize == 32
vextracti128 %1, %3, q0001 ; 3 2 1 0
- paddd %1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1) (4 + 0)
- pshufd %2, %1, q0032 ; xm1 - - (7 + 3) (6 + 2)
+ paddd %2, %1 ; xm1 (7 + 3) (6 + 2) (5 + 1) (4 + 0)
+%endif
+ pshufd %1, %2, q0032 ; xm0 - - (7 + 3) (6 + 2)
paddd %1, %1, %2 ; xm0 _ _ (5 1 7 3) (4 0 6 2)
pshufd %2, %1, q0001 ; xm1 _ _ (5 1 7 3) (5 1 7 3)
paddd %1, %1, %2 ; (01234567)
%endmacro
-%if ARCH_X86_64
-%if HAVE_AVX2_EXTERNAL
-
-INIT_YMM avx2
-
-cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+%macro VVC_SAD 1
+cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+%if UNIX64 == 0
+ mov block_hd, dword r5m
+%endif
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -74,29 +75,32 @@ cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
lea src2q, [src2q + off2q * 2 + 2 * 2]
pxor m3, m3
+%if mmsize == 32
vpbroadcastd m4, [pw_1]
+%else
+ mova m4, [pw_1]
+%endif
.loop_height:
- movu xm0, [src1q]
- vinserti128 m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
- movu xm1, [src2q]
- vinserti128 m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
-
+ movu m0, [src1q]
+ movu m1, [src2q]
MIN_MAX_SAD m1, m0, m2
pmaddwd m1, m4
paddd m3, m1
- add src1q, 2 * MAX_PB_SIZE * ROWS * 2
- add src2q, 2 * MAX_PB_SIZE * ROWS * 2
+ add src1q, ROWS * MAX_PB_SIZE * 2
+ add src2q, ROWS * MAX_PB_SIZE * 2
- sub block_hd, 4
+ sub block_hd, 2
jg .loop_height
HORIZ_ADD xm0, xm3, m3
movd eax, xm0
RET
+%endmacro
-cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+%macro VVC_SAD_LOOP 1
+cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -119,7 +123,11 @@ cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
lea src2q, [src2q + off2q * 2 + 2 * 2]
pxor m3, m3
+%if mmsize == 32
vpbroadcastd m4, [pw_1]
+%else
+ mova m4, [pw_1]
+%endif
shl block_wd, 1
add src1q, block_wq
@@ -149,6 +157,15 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
HORIZ_ADD xm0, xm3, m3
movd eax, xm0
RET
+%endmacro
+%if ARCH_X86_64
+INIT_XMM sse4
+VVC_SAD 8
+VVC_SAD_LOOP 16
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VVC_SAD 16
+VVC_SAD_LOOP 32
%endif
%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index bd60963432..cdf0e36b62 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -316,16 +316,10 @@ ALF_FUNCS(16, 12, avx2)
int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
int dx, int dy, int block_w, int block_h) \
-SAD_PROTOTYPE(8, avx2);
+SAD_PROTOTYPE(8, sse4);
+SAD_PROTOTYPE(16, sse4);
SAD_PROTOTYPE(16, avx2);
-
-#define SAD_INIT(opt) do { \
- c->inter.sad[0] = ff_vvc_sad_8_##opt; \
- c->inter.sad[1] = \
- c->inter.sad[2] = \
- c->inter.sad[3] = \
- c->inter.sad[4] = ff_vvc_sad_16_##opt; \
-} while (0)
+SAD_PROTOTYPE(32, avx2);
#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -337,36 +331,60 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
case 8:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(8);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
case 10:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(10);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(10);
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
case 12:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(12);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(12);
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
default:
--
2.45.1
More information about the ffmpeg-devel mailing list