[FFmpeg-cvslog] r24340 - in trunk/libavcodec: avcodec.h x86/cpuid.c x86/dsputilenc_mmx.c x86/vp8dsp-init.c

Tue Jul 20 00:51:50 CEST 2010

On Mon, Jul 19, 2010 at 3:38 PM, rbultje <subversion at mplayerhq.hu> wrote:
> Author: rbultje
> Date: Tue Jul 20 00:38:23 2010
> New Revision: 24340
>
> Log:
> Remove the FF_MM_SSE2/3 flags on CPUs where SSE2/3 code is generally not
> faster than regular MMX code; the Core1 CPU is one example. Instead, set a
> new flag, FF_MM_SSE2/3SLOW, which can be tested for particular SSE2/3
> functions that have been checked specifically on such CPUs and are actually
> faster than their MMX counterparts.

ff_idct_xvid_sse2 should also be faster than its MMX counterpart on SSE2SLOW
CPUs, but I no longer have access to a Core1 to benchmark it.

I also just realized there's no way to set dsp_mask from the ffmpeg
command line, so it's a little inconvenient to test.

>
> In addition, use this flag to enable particular VP8 and LPC SSE2 functions
> that are faster than their MMX counterparts.
>
> Modified:
>    trunk/libavcodec/avcodec.h
>    trunk/libavcodec/x86/cpuid.c
>    trunk/libavcodec/x86/dsputilenc_mmx.c
>    trunk/libavcodec/x86/vp8dsp-init.c
>
> Modified: trunk/libavcodec/avcodec.h
> ==============================================================================
> --- trunk/libavcodec/avcodec.h  Mon Jul 19 23:53:28 2010        (r24339)
> +++ trunk/libavcodec/avcodec.h  Tue Jul 20 00:38:23 2010        (r24340)
> @@ -1656,8 +1656,12 @@ typedef struct AVCodecContext {
> ?#define FF_MM_MMX2 ? ? 0x0002 ///< SSE integer functions or AMD MMX ext
> ?#define FF_MM_SSE ? ? ?0x0008 ///< SSE functions
> ?#define FF_MM_SSE2 ? ? 0x0010 ///< PIV SSE2 functions
> +#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?///< than regular MMX/SSE (e.g. Core1)
> ?#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
> ?#define FF_MM_SSE3 ? ? 0x0040 ///< Prescott SSE3 functions
> +#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?///< than regular MMX/SSE (e.g. Core1)
> ?#define FF_MM_SSSE3 ? ?0x0080 ///< Conroe SSSE3 functions
> ?#define FF_MM_SSE4 ? ? 0x0100 ///< Penryn SSE4.1 functions
> ?#define FF_MM_SSE42 ? ?0x0200 ///< Nehalem SSE4.2 functions
>
> Modified: trunk/libavcodec/x86/cpuid.c
> ==============================================================================
> --- trunk/libavcodec/x86/cpuid.c        Mon Jul 19 23:53:28 2010        (r24339)
> +++ trunk/libavcodec/x86/cpuid.c        Tue Jul 20 00:38:23 2010        (r24340)
> @@ -42,6 +42,8 @@ int mm_support(void)
> ? ? int rval = 0;
> ? ? int eax, ebx, ecx, edx;
> ? ? int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
> + ? ?int family=0, model=0;
> + ? ?union { int i[3]; char c[12]; } vendor;
>
> ?#if ARCH_X86_32
> ? ? x86_reg a, c;
> @@ -70,10 +72,12 @@ int mm_support(void)
> ? ? ? ? return 0; /* CPUID not supported */
> ?#endif
>
> - ? ?cpuid(0, max_std_level, ebx, ecx, edx);
> + ? ?cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
>
> ? ? if(max_std_level >= 1){
> ? ? ? ? cpuid(1, eax, ebx, ecx, std_caps);
> + ? ? ? ?family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
> + ? ? ? ?model ?= ((eax>>4)&0xf) + ((eax>>12)&0xf0);
> ? ? ? ? if (std_caps & (1<<23))
> ? ? ? ? ? ? rval |= FF_MM_MMX;
> ? ? ? ? if (std_caps & (1<<25))
> @@ -108,13 +112,24 @@ int mm_support(void)
> ? ? ? ? ? ? rval |= FF_MM_MMX2;
> ? ? }
>
> + ? ?if (!strncmp(vendor.c, "GenuineIntel", 12) &&
> + ? ? ? ?family == 6 && (model == 9 || model == 13 || model == 14)) {
> + ? ? ? ?/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
> + ? ? ? ? * theoretically support sse2, but it's usually slower than mmx,
> + ? ? ? ? * so let's just pretend they don't. */
> + ? ? ? ?if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2;
> + ? ? ? ?if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3;
> + ? ?}
> +
> ?#if 0
> - ? ?av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
> + ? ?av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n",
> ? ? ? ? (rval&FF_MM_MMX) ? "MMX ":"",
> ? ? ? ? (rval&FF_MM_MMX2) ? "MMX2 ":"",
> ? ? ? ? (rval&FF_MM_SSE) ? "SSE ":"",
> ? ? ? ? (rval&FF_MM_SSE2) ? "SSE2 ":"",
> + ? ? ? ?(rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"",
> ? ? ? ? (rval&FF_MM_SSE3) ? "SSE3 ":"",
> + ? ? ? ?(rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"",
> ? ? ? ? (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
> ? ? ? ? (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
> ? ? ? ? (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
>
> Modified: trunk/libavcodec/x86/dsputilenc_mmx.c
> ==============================================================================
> --- trunk/libavcodec/x86/dsputilenc_mmx.c       Mon Jul 19 23:53:28 2010        (r24339)
> +++ trunk/libavcodec/x86/dsputilenc_mmx.c       Tue Jul 20 00:38:23 2010        (r24340)
> @@ -1409,9 +1409,10 @@ void dsputilenc_init_mmx(DSPContext* c,
> ? ? ? ? ? ? c->sum_abs_dctelem= sum_abs_dctelem_sse2;
> ? ? ? ? ? ? c->hadamard8_diff[0]= hadamard8_diff16_sse2;
> ? ? ? ? ? ? c->hadamard8_diff[1]= hadamard8_diff_sse2;
> -#if CONFIG_LPC
> + ? ? ? ?}
> +
> + ? ? ? ?if (CONFIG_LPC && mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
> ? ? ? ? ? ? c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
> -#endif
> ? ? ? ? }
>
> ?#if HAVE_SSSE3
>
> Modified: trunk/libavcodec/x86/vp8dsp-init.c
> ==============================================================================
> --- trunk/libavcodec/x86/vp8dsp-init.c  Mon Jul 19 23:53:28 2010        (r24339)
> +++ trunk/libavcodec/x86/vp8dsp-init.c  Tue Jul 20 00:38:23 2010        (r24340)
> @@ -328,7 +328,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPCo
> ? ? ? ? c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
> ? ? }
>
> - ? ?if (mm_flags & FF_MM_SSE2) {
> + ? ?if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
> ? ? ? ? VP8_LUMA_MC_FUNC(0, 16, sse2);
> ? ? ? ? VP8_MC_FUNC(1, 8, sse2);
> ? ? ? ? VP8_BILINEAR_MC_FUNC(0, 16, sse2);
> @@ -338,8 +338,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPCo
> ? ? ? ? c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
>
> ? ? ? ? c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
> - ? ? ? ?c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
> ? ? ? ? c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
> + ? ?}
> +
> + ? ?if (mm_flags & FF_MM_SSE2) {
> + ? ? ? ?c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
> ? ? ? ? c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
> ? ? }
>
> _______________________________________________
> ffmpeg-cvslog mailing list
> ffmpeg-cvslog at mplayerhq.hu
> https://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-cvslog
>