[FFmpeg-devel] [PATCH 2/2 v2] x86/takdsp: add avx2 versions of all functions
James Almer
jamrial at gmail.com
Sat Dec 23 01:52:58 EET 2023
On an Intel Core i7-12700K:
decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/takdsp.asm | 41 ++++++++++++++++++++++++------------
libavcodec/x86/takdsp_init.c | 11 ++++++++++
2 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
index be8e1ab553..d55c5f39aa 100644
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,44 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm
+%if UNIX64
+ movd xm3, dfactorm
+ VPBROADCASTD m3, xm3
+%else
+ VPBROADCASTD m3, dfactorm
+%endif
+ VBROADCASTI128 m4, [pd_128]
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2
+ psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF
diff --git a/libavcodec/x86/takdsp_init.c b/libavcodec/x86/takdsp_init.c
index 12b62b8247..9553f8442c 100644
--- a/libavcodec/x86/takdsp_init.c
+++ b/libavcodec/x86/takdsp_init.c
@@ -24,9 +24,13 @@
#include "config.h"
void ff_tak_decorrelate_ls_sse2(const int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_ls_avx2(const int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sr_sse2(int32_t *p1, const int32_t *p2, int length);
+void ff_tak_decorrelate_sr_avx2(int32_t *p1, const int32_t *p2, int length);
void ff_tak_decorrelate_sm_sse2(int32_t *p1, int32_t *p2, int length);
+void ff_tak_decorrelate_sm_avx2(int32_t *p1, int32_t *p2, int length);
void ff_tak_decorrelate_sf_sse4(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
+void ff_tak_decorrelate_sf_avx2(int32_t *p1, const int32_t *p2, int length, int dshift, int dfactor);
av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
{
@@ -42,5 +46,12 @@ av_cold void ff_takdsp_init_x86(TAKDSPContext *c)
if (EXTERNAL_SSE4(cpu_flags)) {
c->decorrelate_sf = ff_tak_decorrelate_sf_sse4;
}
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ c->decorrelate_ls = ff_tak_decorrelate_ls_avx2;
+ c->decorrelate_sr = ff_tak_decorrelate_sr_avx2;
+ c->decorrelate_sm = ff_tak_decorrelate_sm_avx2;
+ c->decorrelate_sf = ff_tak_decorrelate_sf_avx2;
+ }
#endif
}
--
2.43.0
More information about the ffmpeg-devel mailing list