[FFmpeg-cvslog] x86/tx_float: fully support 128bit regs in LOAD64_LUT
Lynne
git at videolan.org
Mon Sep 19 07:01:20 EEST 2022
ffmpeg | branch: master | Lynne <dev at lynne.ee> | Mon Sep 19 04:13:04 2022 +0200| [892548e6a1a514fc23c5bb42e549b1a0bb604b6a] | committer: Lynne
x86/tx_float: fully support 128bit regs in LOAD64_LUT
The gather path didn't support 128-bit registers.
It's not faster on Zen 3, but it's included for completeness.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=892548e6a1a514fc23c5bb42e549b1a0bb604b6a
---
libavutil/x86/tx_float.asm | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 3b3e26ebcb..b644db49be 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -107,19 +107,19 @@ SECTION .text
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
- pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
- movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
- vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
+ pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+ movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+ vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
%else
mov %5d, [%3 + %4 + 0]
movsd xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
mov %5d, [%3 + %4 + 8]
movsd xmm%6, [%2 + %5q*8]
%endif
mov %5d, [%3 + %4 + 4]
movhps xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
mov %5d, [%3 + %4 + 12]
movhps xmm%6, [%2 + %5q*8]
vinsertf128 %1, %1, xmm%6, 1
More information about the ffmpeg-cvslog mailing list