[FFmpeg-devel] [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag

Wu, Jianhua jianhua.wu at intel.com
Wed Mar 2 07:33:57 EET 2022


Ping.
> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu at intel.com>
> Sent: Wednesday, February 23, 2022 4:58 PM
> To: ffmpeg-devel at ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu at intel.com>
> Subject: [PATCH 1/6] avutil/cpu: add AVX512 Icelake flag
> 
> From: Wu Jianhua <jianhua.wu at intel.com>
> 
> Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
> ---
>  configure                 | 13 +++++++---
>  libavutil/cpu.c           |  1 +
>  libavutil/cpu.h           |  1 +
>  libavutil/x86/cpu.c       |  8 ++++--
>  libavutil/x86/cpu.h       |  1 +
>  libavutil/x86/x86inc.asm  | 53 ++++++++++++++++++++-------------------
>  tests/checkasm/checkasm.c | 35 +++++++++++++-------------
>  7 files changed, 63 insertions(+), 49 deletions(-)
> 
> diff --git a/configure b/configure
> index 1535dc3c5b..d88c2ae979 100755
> --- a/configure
> +++ b/configure
> @@ -444,6 +444,7 @@ Optimization options (experts only):
>    --disable-fma4           disable FMA4 optimizations
>    --disable-avx2           disable AVX2 optimizations
>    --disable-avx512         disable AVX-512 optimizations
> +  --disable-avx512icl      disable AVX-512ICL optimizations
>    --disable-aesni          disable AESNI optimizations
>    --disable-armv5te        disable armv5te optimizations
>    --disable-armv6          disable armv6 optimizations
> @@ -2098,6 +2099,7 @@ ARCH_EXT_LIST_X86_SIMD="
>      avx
>      avx2
>      avx512
> +    avx512icl
>      fma3
>      fma4
>      mmx
> @@ -2666,6 +2668,7 @@ fma3_deps="avx"
>  fma4_deps="avx"
>  avx2_deps="avx"
>  avx512_deps="avx2"
> +avx512icl_deps="avx512"
> 
>  mmx_external_deps="x86asm"
>  mmx_inline_deps="inline_asm x86"
> @@ -6128,10 +6131,11 @@ EOF
>              elf*) enabled debug && append X86ASMFLAGS $x86asm_debug ;;
>          esac
> 
> -        enabled avx512 && check_x86asm avx512_external "vmovdqa32 [eax]{k1}{z}, zmm0"
> -        enabled avx2   && check_x86asm avx2_external   "vextracti128 xmm0, ymm0, 0"
> -        enabled xop    && check_x86asm xop_external    "vpmacsdd xmm0, xmm1, xmm2, xmm3"
> -        enabled fma4   && check_x86asm fma4_external   "vfmaddps ymm0, ymm1, ymm2, ymm3"
> +        enabled avx512    && check_x86asm avx512_external    "vmovdqa32 [eax]{k1}{z}, zmm0"
> +        enabled avx512icl && check_x86asm avx512icl_external "vpdpwssds zmm31{k1}{z}, zmm29, zmm28"
> +        enabled avx2      && check_x86asm avx2_external      "vextracti128 xmm0, ymm0, 0"
> +        enabled xop       && check_x86asm xop_external       "vpmacsdd xmm0, xmm1, xmm2, xmm3"
> +        enabled fma4      && check_x86asm fma4_external      "vfmaddps ymm0, ymm1, ymm2, ymm3"
>          check_x86asm cpunop          "CPU amdnop"
>      fi
> 
> @@ -7471,6 +7475,7 @@ if enabled x86; then
>      echo "AVX enabled               ${avx-no}"
>      echo "AVX2 enabled              ${avx2-no}"
>      echo "AVX-512 enabled           ${avx512-no}"
> +    echo "AVX-512ICL enabled        ${avx512icl-no}"
>      echo "XOP enabled               ${xop-no}"
>      echo "FMA3 enabled              ${fma3-no}"
>      echo "FMA4 enabled              ${fma4-no}"
> diff --git a/libavutil/cpu.c b/libavutil/cpu.c
> index 1368502245..833c220192 100644
> --- a/libavutil/cpu.c
> +++ b/libavutil/cpu.c
> @@ -137,6 +137,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
>          { "cmov",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV     },    .unit = "flags" },
>          { "aesni",    NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AESNI    },    .unit = "flags" },
>          { "avx512"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512   },    .unit = "flags" },
> +        { "avx512icl",  NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512ICL   }, .unit = "flags" },
>          { "slowgather", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SLOW_GATHER }, .unit = "flags" },
> 
>  #define CPU_FLAG_P2 AV_CPU_FLAG_CMOV | AV_CPU_FLAG_MMX
> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> index ce9bf14bf7..9711e574c5 100644
> --- a/libavutil/cpu.h
> +++ b/libavutil/cpu.h
> @@ -54,6 +54,7 @@
>  #define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
>  #define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
>  #define AV_CPU_FLAG_AVX512     0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
> +#define AV_CPU_FLAG_AVX512ICL  0x200000 ///< F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ
>  #define AV_CPU_FLAG_SLOW_GATHER  0x2000000 ///< CPU has slow gathers.
> 
>  #define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> index 7b13fcae91..d6cd4fab9c 100644
> --- a/libavutil/x86/cpu.c
> +++ b/libavutil/x86/cpu.c
> @@ -150,9 +150,13 @@ int ff_get_cpu_flags_x86(void)
>              rval |= AV_CPU_FLAG_AVX2;
>  #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
>          if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> -            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
> +            if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) {
>                  rval |= AV_CPU_FLAG_AVX512;
> -
> +#if HAVE_AVX512ICL
> +                if ((ebx & 0xd0200000) == 0xd0200000 && (ecx & 0x5f42) == 0x5f42)
> +                    rval |= AV_CPU_FLAG_AVX512ICL;
> +#endif /* HAVE_AVX512ICL */
> +            }
>          }
>  #endif /* HAVE_AVX512 */
>  #endif /* HAVE_AVX2 */
> diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
> index 937c697fa0..40a1eef0ab 100644
> --- a/libavutil/x86/cpu.h
> +++ b/libavutil/x86/cpu.h
> @@ -80,6 +80,7 @@
>  #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
>  #define EXTERNAL_AESNI(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
>  #define EXTERNAL_AVX512(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
> +#define EXTERNAL_AVX512ICL(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512ICL)
> 
>  #define INLINE_AMD3DNOW(flags)      CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
>  #define INLINE_AMD3DNOWEXT(flags)   CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
> diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
> index 01c35e3a4b..251ee797de 100644
> --- a/libavutil/x86/x86inc.asm
> +++ b/libavutil/x86/x86inc.asm
> @@ -817,32 +817,33 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
> 
>  ; cpuflags
> 
> -%assign cpuflags_mmx      (1<<0)
> -%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
> -%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
> -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
> -%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
> -%assign cpuflags_sse2     (1<<5) | cpuflags_sse
> -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
> -%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
> -%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
> -%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
> -%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
> -%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
> -%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
> -%assign cpuflags_avx      (1<<13)| cpuflags_sse42
> -%assign cpuflags_xop      (1<<14)| cpuflags_avx
> -%assign cpuflags_fma4     (1<<15)| cpuflags_avx
> -%assign cpuflags_fma3     (1<<16)| cpuflags_avx
> -%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
> -%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
> -%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
> -%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> -
> -%assign cpuflags_cache32  (1<<21)
> -%assign cpuflags_cache64  (1<<22)
> -%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
> -%assign cpuflags_atom     (1<<24)
> +%assign cpuflags_mmx       (1<<0)
> +%assign cpuflags_mmx2      (1<<1) | cpuflags_mmx
> +%assign cpuflags_3dnow     (1<<2) | cpuflags_mmx
> +%assign cpuflags_3dnowext  (1<<3) | cpuflags_3dnow
> +%assign cpuflags_sse       (1<<4) | cpuflags_mmx2
> +%assign cpuflags_sse2      (1<<5) | cpuflags_sse
> +%assign cpuflags_sse2slow  (1<<6) | cpuflags_sse2
> +%assign cpuflags_lzcnt     (1<<7) | cpuflags_sse2
> +%assign cpuflags_sse3      (1<<8) | cpuflags_sse2
> +%assign cpuflags_ssse3     (1<<9) | cpuflags_sse3
> +%assign cpuflags_sse4      (1<<10)| cpuflags_ssse3
> +%assign cpuflags_sse42     (1<<11)| cpuflags_sse4
> +%assign cpuflags_aesni     (1<<12)| cpuflags_sse42
> +%assign cpuflags_avx       (1<<13)| cpuflags_sse42
> +%assign cpuflags_xop       (1<<14)| cpuflags_avx
> +%assign cpuflags_fma4      (1<<15)| cpuflags_avx
> +%assign cpuflags_fma3      (1<<16)| cpuflags_avx
> +%assign cpuflags_bmi1      (1<<17)| cpuflags_avx|cpuflags_lzcnt
> +%assign cpuflags_bmi2      (1<<18)| cpuflags_bmi1
> +%assign cpuflags_avx2      (1<<19)| cpuflags_fma3|cpuflags_bmi2
> +%assign cpuflags_avx512    (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
> +%assign cpuflags_avx512icl (1<<25)| cpuflags_avx512
> +
> +%assign cpuflags_cache32   (1<<21)
> +%assign cpuflags_cache64   (1<<22)
> +%assign cpuflags_aligned   (1<<23) ; not a cpu feature, but a function variant
> +%assign cpuflags_atom      (1<<24)
> 
>  ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
>  %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index f74125e810..e77b4ec20f 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -220,23 +220,24 @@ static const struct {
>      { "MMI",      "mmi",      AV_CPU_FLAG_MMI },
>      { "MSA",      "msa",      AV_CPU_FLAG_MSA },
>  #elif ARCH_X86
> -    { "MMX",      "mmx",      AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> -    { "MMXEXT",   "mmxext",   AV_CPU_FLAG_MMXEXT },
> -    { "3DNOW",    "3dnow",    AV_CPU_FLAG_3DNOW },
> -    { "3DNOWEXT", "3dnowext", AV_CPU_FLAG_3DNOWEXT },
> -    { "SSE",      "sse",      AV_CPU_FLAG_SSE },
> -    { "SSE2",     "sse2",     AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> -    { "SSE3",     "sse3",     AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> -    { "SSSE3",    "ssse3",    AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> -    { "SSE4.1",   "sse4",     AV_CPU_FLAG_SSE4 },
> -    { "SSE4.2",   "sse42",    AV_CPU_FLAG_SSE42 },
> -    { "AES-NI",   "aesni",    AV_CPU_FLAG_AESNI },
> -    { "AVX",      "avx",      AV_CPU_FLAG_AVX },
> -    { "XOP",      "xop",      AV_CPU_FLAG_XOP },
> -    { "FMA3",     "fma3",     AV_CPU_FLAG_FMA3 },
> -    { "FMA4",     "fma4",     AV_CPU_FLAG_FMA4 },
> -    { "AVX2",     "avx2",     AV_CPU_FLAG_AVX2 },
> -    { "AVX-512",  "avx512",   AV_CPU_FLAG_AVX512 },
> +    { "MMX",        "mmx",       AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
> +    { "MMXEXT",     "mmxext",    AV_CPU_FLAG_MMXEXT },
> +    { "3DNOW",      "3dnow",     AV_CPU_FLAG_3DNOW },
> +    { "3DNOWEXT",   "3dnowext",  AV_CPU_FLAG_3DNOWEXT },
> +    { "SSE",        "sse",       AV_CPU_FLAG_SSE },
> +    { "SSE2",       "sse2",      AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW },
> +    { "SSE3",       "sse3",      AV_CPU_FLAG_SSE3|AV_CPU_FLAG_SSE3SLOW },
> +    { "SSSE3",      "ssse3",     AV_CPU_FLAG_SSSE3|AV_CPU_FLAG_ATOM },
> +    { "SSE4.1",     "sse4",      AV_CPU_FLAG_SSE4 },
> +    { "SSE4.2",     "sse42",     AV_CPU_FLAG_SSE42 },
> +    { "AES-NI",     "aesni",     AV_CPU_FLAG_AESNI },
> +    { "AVX",        "avx",       AV_CPU_FLAG_AVX },
> +    { "XOP",        "xop",       AV_CPU_FLAG_XOP },
> +    { "FMA3",       "fma3",      AV_CPU_FLAG_FMA3 },
> +    { "FMA4",       "fma4",      AV_CPU_FLAG_FMA4 },
> +    { "AVX2",       "avx2",      AV_CPU_FLAG_AVX2 },
> +    { "AVX-512",    "avx512",    AV_CPU_FLAG_AVX512 },
> +    { "AVX-512ICL", "avx512icl", AV_CPU_FLAG_AVX512ICL },
>  #elif ARCH_LOONGARCH
>      { "LSX",      "lsx",      AV_CPU_FLAG_LSX },
>      { "LASX",     "lasx",     AV_CPU_FLAG_LASX },
> --
> 2.17.1



More information about the ffmpeg-devel mailing list