[FFmpeg-devel] [PATCH] x86/cpu: add AV_CPU_FLAG_AVXSLOW flag

James Almer jamrial at gmail.com
Sat May 23 03:17:26 CEST 2015


Signed-off-by: James Almer <jamrial at gmail.com>
---
 doc/APIchanges      |  3 +++
 libavutil/cpu.c     |  5 +++++
 libavutil/cpu.h     |  1 +
 libavutil/version.h |  4 ++--
 libavutil/x86/cpu.c | 17 ++++++++++++++---
 5 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/doc/APIchanges b/doc/APIchanges
index 3cca389..6fa40b2 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -15,6 +15,9 @@ libavutil:     2014-08-09
 
 API changes, most recent first:
 
+2015-xx-xx - xxxxxxx - lavu 54.24.100 - cpu.h
+  Add AV_CPU_FLAG_AVXSLOW.
+
 2015-05-13 - xxxxxxx - lavc 56.39.100 / 56.23.0
   Add av_vda_default_init2.
 
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 765577d..780368d 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -59,6 +59,7 @@ void av_force_cpu_flags(int arg){
                     AV_CPU_FLAG_SSE4     |
                     AV_CPU_FLAG_SSE42    |
                     AV_CPU_FLAG_AVX      |
+                    AV_CPU_FLAG_AVXSLOW  |
                     AV_CPU_FLAG_XOP      |
                     AV_CPU_FLAG_FMA3     |
                     AV_CPU_FLAG_FMA4     |
@@ -111,6 +112,7 @@ int av_parse_cpu_flags(const char *s)
 #define CPUFLAG_SSE4     (AV_CPU_FLAG_SSE4     | CPUFLAG_SSSE3)
 #define CPUFLAG_SSE42    (AV_CPU_FLAG_SSE42    | CPUFLAG_SSE4)
 #define CPUFLAG_AVX      (AV_CPU_FLAG_AVX      | CPUFLAG_SSE42)
+#define CPUFLAG_AVXSLOW  (AV_CPU_FLAG_AVXSLOW  | CPUFLAG_AVX)
 #define CPUFLAG_XOP      (AV_CPU_FLAG_XOP      | CPUFLAG_AVX)
 #define CPUFLAG_FMA3     (AV_CPU_FLAG_FMA3     | CPUFLAG_AVX)
 #define CPUFLAG_FMA4     (AV_CPU_FLAG_FMA4     | CPUFLAG_AVX)
@@ -133,6 +135,7 @@ int av_parse_cpu_flags(const char *s)
         { "sse4.1"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE4         },    .unit = "flags" },
         { "sse4.2"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42        },    .unit = "flags" },
         { "avx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX          },    .unit = "flags" },
+        { "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVXSLOW      },    .unit = "flags" },
         { "xop"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP          },    .unit = "flags" },
         { "fma3"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3         },    .unit = "flags" },
         { "fma4"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4         },    .unit = "flags" },
@@ -192,6 +195,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
         { "sse4.1"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE4     },    .unit = "flags" },
         { "sse4.2"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE42    },    .unit = "flags" },
         { "avx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX      },    .unit = "flags" },
+        { "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVXSLOW  },    .unit = "flags" },
         { "xop"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_XOP      },    .unit = "flags" },
         { "fma3"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA3     },    .unit = "flags" },
         { "fma4"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA4     },    .unit = "flags" },
@@ -320,6 +324,7 @@ static const struct {
     { AV_CPU_FLAG_SSE4,      "sse4.1"     },
     { AV_CPU_FLAG_SSE42,     "sse4.2"     },
     { AV_CPU_FLAG_AVX,       "avx"        },
+    { AV_CPU_FLAG_AVXSLOW,   "avxslow"    },
     { AV_CPU_FLAG_XOP,       "xop"        },
     { AV_CPU_FLAG_FMA3,      "fma3"       },
     { AV_CPU_FLAG_FMA4,      "fma4"       },
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index 277e489..10b7136 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -43,6 +43,7 @@
 #define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
 #define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
 #define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
 #define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
 #define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
 // #if LIBAVUTIL_VERSION_MAJOR <52
diff --git a/libavutil/version.h b/libavutil/version.h
index 18a2295..eeafcfa 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -56,8 +56,8 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  54
-#define LIBAVUTIL_VERSION_MINOR  23
-#define LIBAVUTIL_VERSION_MICRO 101
+#define LIBAVUTIL_VERSION_MINOR  24
+#define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
                                                LIBAVUTIL_VERSION_MINOR, \
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 2b62e92..7a5d4e6 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -167,6 +167,7 @@ int ff_get_cpu_flags_x86(void)
         if (ext_caps & (1 << 22))
             rval |= AV_CPU_FLAG_MMXEXT;
 
+        if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
         /* Allow for selectively disabling SSE2 functions on AMD processors
            with SSE2 support but not SSE4a. This includes Athlon64, some
            Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
@@ -174,9 +175,19 @@ int ff_get_cpu_flags_x86(void)
            AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
            so that SSE2 is used unless explicitly disabled by checking
            AV_CPU_FLAG_SSE2SLOW. */
-        if (!strncmp(vendor.c, "AuthenticAMD", 12) &&
-            rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) {
-            rval |= AV_CPU_FLAG_SSE2SLOW;
+            if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
+                rval |= AV_CPU_FLAG_SSE2SLOW;
+
+        /* Similar to the above but for AVX functions on AMD processors.
+           This is necessary only for functions using YMM registers on Bulldozer
+           based CPUs as they lack 256-bit execution units. SSE/AVX functions
+           using XMM registers are always faster on them.
+           AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
+           used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW.
+           TODO: Confirm if Excavator is affected or not by this once it's
+                 released, and update the check if necessary. Same for btver2. */
+            if (family == 0x15 && (rval & AV_CPU_FLAG_AVX))
+                rval |= AV_CPU_FLAG_AVXSLOW;
         }
 
         /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.4.1



More information about the ffmpeg-devel mailing list