[FFmpeg-cvslog] x86/tx_float: fully support 128bit regs in LOAD64_LUT

Mon Sep 19 07:01:20 EEST 2022

ffmpeg | branch: master | Lynne <dev at lynne.ee> | Mon Sep 19 04:13:04 2022 +0200| [892548e6a1a514fc23c5bb42e549b1a0bb604b6a] | committer: Lynne

x86/tx_float: fully support 128bit regs in LOAD64_LUT

The gather (vgatherdpd) path previously did not support 128-bit registers.
It is not faster on Zen 3, but it is included for completeness.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=892548e6a1a514fc23c5bb42e549b1a0bb604b6a
---

 libavutil/x86/tx_float.asm | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 3b3e26ebcb..b644db49be 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -107,19 +107,19 @@ SECTION .text
 ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
 %macro LOAD64_LUT 5-7
 %if %0 > 6 && cpuflag(avx2)
-    pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
-    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
-    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
+    pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+    movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+    vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
 %else
     mov      %5d, [%3 + %4 + 0]
     movsd  xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
     mov      %5d, [%3 + %4 + 8]
     movsd  xmm%6, [%2 + %5q*8]
 %endif
     mov      %5d, [%3 + %4 + 4]
     movhps xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
     mov      %5d, [%3 + %4 + 12]
     movhps xmm%6, [%2 + %5q*8]
     vinsertf128 %1, %1, xmm%6, 1