[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

Alan Kelly alankelly at google.com
Fri Jun 25 10:54:29 EEST 2021


Broadwell and newer, and Zen 3 and newer, have fast gather instructions.
Older AVX2 CPUs are flagged with AV_CPU_FLAG_AVX2SLOW so that gather-based
code paths can be avoided on them.
---
 Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell,
 and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3.
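
 For reference, a minimal sketch of how a caller outside libavutil could act on
 the new flag (not part of this patch; the printed strings are illustrative
 only):

#include <stdio.h>
#include "libavutil/cpu.h"

int main(void)
{
    int flags = av_get_cpu_flags();

    if ((flags & AV_CPU_FLAG_AVX2) && !(flags & AV_CPU_FLAG_AVX2SLOW))
        printf("AVX2 with fast gather\n");
    else if (flags & AV_CPU_FLAG_AVX2)
        printf("AVX2 present, but gather is slow\n");
    else
        printf("AVX2 not available\n");
    return 0;
}
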
 libavutil/cpu.h     |  1 +
 libavutil/x86/cpu.c | 17 +++++++++++++++--
 libavutil/x86/cpu.h |  1 +
 3 files changed, 17 insertions(+), 2 deletions(-)
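
 The intended consumer is the EXTERNAL_AVX2_FAST_GATHER() macro added to
 libavutil/x86/cpu.h below. A minimal sketch of the dispatch pattern it
 enables, with hypothetical names (FooContext, ff_foo_gather_avx2 and
 ff_foo_c are placeholders, not symbols from this series), buildable only
 inside the FFmpeg tree since the macro expands to configure-time HAVE_*
 checks:

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

typedef struct FooContext {
    void (*foo)(float *dst, const float *src, int n);
} FooContext;

void ff_foo_gather_avx2(float *dst, const float *src, int n); // vpgatherdd-based
void ff_foo_c(float *dst, const float *src, int n);           // generic C fallback

void ff_foo_init_x86(FooContext *ctx)
{
    int cpu_flags = av_get_cpu_flags();

    ctx->foo = ff_foo_c;
    // Take the gather kernel only where gather is fast: AVX2 is present and
    // AV_CPU_FLAG_AVX2SLOW is not set.
    if (EXTERNAL_AVX2_FAST_GATHER(cpu_flags))
        ctx->foo = ff_foo_gather_avx2;
}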

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index b555422dae..f94eb79af1 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -50,6 +50,7 @@
 #define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
 #define AV_CPU_FLAG_CMOV         0x1000 ///< supports cmov instruction
 #define AV_CPU_FLAG_AVX2         0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVX2SLOW  0x2000000 ///< AVX2 supported, but gather is slow (pre-Broadwell Intel, pre-Zen 3 AMD)
 #define AV_CPU_FLAG_FMA3        0x10000 ///< Haswell FMA3 functions
 #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..56fcde594c 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,19 @@ int ff_get_cpu_flags_x86(void)
     if (max_std_level >= 7) {
         cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
+        if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) {
             rval |= AV_CPU_FLAG_AVX2;
+
+            // family and model were already read from CPUID leaf 1 above;
+            // re-running cpuid(1) here would clobber the leaf 7 ebx needed below.
+            // Haswell and earlier have slow gather.
+            if (family == 6 && model < 70)
+                rval |= AV_CPU_FLAG_AVX2SLOW;
+            // Zen 2 and earlier also have slow gather.
+            if (!strncmp(vendor.c, "AuthenticAMD", 12) && family < 25)
+                rval |= AV_CPU_FLAG_AVX2SLOW;
+        }
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
         if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
             if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
@@ -194,8 +205,10 @@ int ff_get_cpu_flags_x86(void)
            functions using XMM registers are always faster on them.
            AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
            used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
-            if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
+            if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) {
                 rval |= AV_CPU_FLAG_AVXSLOW;
+                rval |= AV_CPU_FLAG_AVX2SLOW;
+            }
         }
 
         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 937c697fa0..a42a15a997 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -78,6 +78,7 @@
 #define EXTERNAL_AVX2(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
 #define EXTERNAL_AVX2_FAST(flags)   CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
 #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
+#define EXTERNAL_AVX2_FAST_GATHER(flags)   CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX2)
 #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
 #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
 
-- 
2.32.0.93.g670b81a890-goog


