diff -Naur /trunk/mplayer-r38142/configure /trunk/mplayer-r38143/configure --- /trunk/mplayer-r38142/configure 2020-05-21 12:44:18.411132800 +0200 +++ /trunk/mplayer-r38143/configure 2020-05-21 12:44:51.348632800 +0200 @@ -3883,6 +3883,26 @@ def_poll_h='#define HAVE_POLL_H 1' echores "$poll_h" +echocheck "emmintrin.h (SSE intrinsics)" +emmintrin_h=no +def_emmintrin_h='#define HAVE_EMMINTRIN_H 0' + cat > $TMPC << EOF +#include + +__attribute__((target("sse2"))) +static int sse2test(int i) { + __m128i mmi = _mm_set1_epi16(i); + mmi = _mm_add_epi16(mmi, mmi); + return _mm_extract_epi16(mmi, 2); +} + +int main(int argc, char **argv) { + return sse2test(argc); +} +EOF +cc_check && emmintrin_h=yes && + def_emmintrin_h='#define HAVE_EMMINTRIN_H 1' +echores "$emmintrin_h" echocheck "inttypes.h (required)" _inttypes=no @@ -9243,6 +9263,12 @@ $def_io_h $def_poll_h $def_windows_h +$def_emmintrin_h +#if ARCH_X86_32 +#define ATTR_TARGET_SSE2 __attribute__((target("sse2"))) +#else +#define ATTR_TARGET_SSE2 +#endif /* external libraries */ $def_bzlib diff -Naur /trunk/mplayer-r38142/sub/osd.c /trunk/mplayer-r38143/sub/osd.c --- /trunk/mplayer-r38142/sub/osd.c 2020-05-21 12:44:20.567382800 +0200 +++ /trunk/mplayer-r38143/sub/osd.c 2020-05-21 12:44:51.348632800 +0200 @@ -31,7 +31,12 @@ #include "libmpcodecs/img_format.h" #include "cpudetect.h" -#if ARCH_X86 +#if !HAVE_EMMINTRIN_H +#undef HAVE_SSE2 +#define HAVE_SSE2 0 +#endif + +#if ARCH_X86 && (!HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT) static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL; static const unsigned long long mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL; static const unsigned long long mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL; @@ -39,32 +44,38 @@ //Note: we have C, X86-nommx, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one //Plain C versions -#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT +#if (!HAVE_MMX && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT #define COMPILE_C #endif #if ARCH_X86 -#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT +#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT #define COMPILE_MMX #endif -#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT +#if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT #define COMPILE_MMX2 #endif -#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT +#if (HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT #define COMPILE_3DNOW #endif +#if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT +#define COMPILE_SSE2 +#endif + #endif /* ARCH_X86 */ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 0 #define HAVE_MMX2 0 #define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 0 #if ! ARCH_X86 @@ -72,9 +83,11 @@ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 0 #define HAVE_MMX2 0 #define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 0 #define RENAME(a) a ## _C #include "osd_template.c" #endif @@ -87,9 +100,11 @@ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 0 #define HAVE_MMX2 0 #define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 0 #define RENAME(a) a ## _X86 #include "osd_template.c" #endif @@ -100,9 +115,11 @@ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 1 #define HAVE_MMX2 0 #define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 0 #define RENAME(a) a ## _MMX #include "osd_template.c" #endif @@ -113,9 +130,11 @@ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 1 #define HAVE_MMX2 1 #define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 0 #define RENAME(a) a ## _MMX2 #include "osd_template.c" #endif @@ -126,20 +145,39 @@ #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_AMD3DNOW +#undef HAVE_SSE2 #define HAVE_MMX 1 #define HAVE_MMX2 0 #define HAVE_AMD3DNOW 1 +#define HAVE_SSE2 0 #define RENAME(a) a ## _3DNow #include "osd_template.c" #endif +//SSE2 versions +#ifdef COMPILE_SSE2 +#undef RENAME +#undef HAVE_MMX +#undef HAVE_MMX2 +#undef HAVE_AMD3DNOW +#undef HAVE_SSE2 +#define HAVE_MMX 0 +#define HAVE_MMX2 0 +#define HAVE_AMD3DNOW 0 +#define HAVE_SSE2 1 +#define RENAME(a) a ## _SSE2 +#include +#include "osd_template.c" +#endif #endif /* ARCH_X86 */ void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered by speed / fastest first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX2) vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride); else if(gCpuCaps.has3DNow) vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -151,7 +189,9 @@ vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride); +#elif HAVE_MMX2 vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride); #elif HAVE_AMD3DNOW vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -169,7 +209,9 @@ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered by speed / fastest first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX2) vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride); else if(gCpuCaps.has3DNow) vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -181,7 +223,9 @@ vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride); +#elif HAVE_MMX2 vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride); #elif HAVE_AMD3DNOW vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -199,7 +243,9 @@ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered by speed / fastest first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX2) vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride); else if(gCpuCaps.has3DNow) vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -211,7 +257,9 @@ vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride); +#elif HAVE_MMX2 vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride); #elif HAVE_AMD3DNOW vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -229,7 +277,9 @@ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered by speed / fastest first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX2) vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride); else if(gCpuCaps.has3DNow) vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -241,7 +291,9 @@ vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride); +#elif HAVE_MMX2 vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride); #elif HAVE_AMD3DNOW vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -259,7 +311,9 @@ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered by speed / fastest first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride); + else if(gCpuCaps.hasMMX2) vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride); else if(gCpuCaps.has3DNow) vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -271,7 +325,9 @@ vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride); +#elif HAVE_MMX2 vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride); #elif HAVE_AMD3DNOW vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride); @@ -306,7 +362,9 @@ #if CONFIG_RUNTIME_CPUDETECT #if ARCH_X86 // ordered per speed fasterst first - if(gCpuCaps.hasMMX2) + if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2) + mp_msg(MSGT_OSD,MSGL_INFO,"Using SSE2 Optimized OnScreenDisplay\n"); + else if(gCpuCaps.hasMMX2) mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n"); else if(gCpuCaps.has3DNow) mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n"); @@ -318,7 +376,9 @@ mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n"); #endif #else //CONFIG_RUNTIME_CPUDETECT -#if HAVE_MMX2 +#if HAVE_SSE2 + mp_msg(MSGT_OSD,MSGL_INFO,"Using SSE2 Optimized OnScreenDisplay\n"); +#elif HAVE_MMX2 mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n"); #elif HAVE_AMD3DNOW mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n"); diff -Naur /trunk/mplayer-r38142/sub/osd_template.c /trunk/mplayer-r38143/sub/osd_template.c --- /trunk/mplayer-r38142/sub/osd_template.c 2020-05-21 12:44:20.583007800 +0200 +++ /trunk/mplayer-r38143/sub/osd_template.c 2020-05-21 12:44:51.348632800 +0200 @@ -44,9 +44,60 @@ #define EMMS "emms" #endif +#if HAVE_SSE2 +ATTR_TARGET_SSE2 +static inline __m128i muladd_src_unpacked(__m128i dstlo, __m128i dsthi, __m128i src, __m128i srcalo, __m128i srcahi) +{ + // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8) + __m128i mmlow = _mm_mulhi_epu16(dstlo, srcalo); + __m128i mmhigh = _mm_mulhi_epu16(dsthi, srcahi); + + __m128i res = _mm_packus_epi16(mmlow, mmhigh); + + return _mm_add_epi8(res, src); +} + +ATTR_TARGET_SSE2 +static inline __m128i muladd_src(__m128i dst, __m128i src, __m128i srca) +{ + __m128i zero = _mm_setzero_si128(); + __m128i dstlo = _mm_unpacklo_epi8(dst, zero); + __m128i dsthi = _mm_unpackhi_epi8(dst, zero); + __m128i srcalo = _mm_unpacklo_epi8(zero, srca); + __m128i srcahi = _mm_unpackhi_epi8(zero, srca); + return muladd_src_unpacked(dstlo, dsthi, src, srcalo, srcahi); +} + +ATTR_TARGET_SSE2 +static inline __m128i alphamask(__m128i orig, __m128i blended, __m128i srca) +{ + __m128i zero = _mm_setzero_si128(); + // if (!srca) res |= dst --- assumes srca == 0 implies src == 0, + // thus no need to mask res + __m128i mask = _mm_cmpeq_epi8(srca, zero); + orig = _mm_and_si128(orig, mask); + return _mm_or_si128(blended, orig); +} + +// Special version that compares alpha in 16 bit chunks instead of bytewise +ATTR_TARGET_SSE2 +static inline __m128i alphamask16(__m128i orig, __m128i blended, __m128i srca) +{ + __m128i zero = _mm_setzero_si128(); + // if (!srca) res |= dst --- assumes srca == 0 implies src == 0, + // thus no need to mask res + __m128i mask = _mm_cmpeq_epi16(srca, zero); + orig = _mm_and_si128(orig, mask); + return _mm_or_si128(blended, orig); +} +#endif + +#if HAVE_SSE2 +ATTR_TARGET_SSE2 +#endif static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){ int y; -#if defined(FAST_OSD) && !HAVE_MMX +#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2 w=w>>1; #endif #if HAVE_MMX @@ -94,7 +145,28 @@ :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x]) : "%eax"); } -#else +#elif HAVE_SSE2 + __m128i zero = _mm_setzero_si128(); + for(x=0;x+8>8)+src[x]; + } +#else /* HAVE_SSE2 */ for(x=0;x> 8)) + 128 + mmlow = _mm_srai_epi16(mmdst, 8); + mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo); + mmhigh = _mm_srai_epi16(mmdst2, 8); + mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi); + + mmuv = _mm_packs_epi16(mmlow, mmhigh); + + res = _mm_unpacklo_epi8(mmy, mmuv); + res = alphamask16(mmdst, res, mmsrcalo); + // convert UV to unsigned + res = _mm_xor_si128(res, uvofs); + _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res); + + res = _mm_unpackhi_epi8(mmy, mmuv); + res = alphamask16(mmdst2, res, mmsrcahi); + // convert UV to unsigned + res = _mm_xor_si128(res, uvofs); + _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res); + } + for(;x>8)+src[x]; + dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128; + } + } +#else /* HAVE_SSE2 */ for(x=0;x> 8)) + 128 + // sign-extend and multiply + mmlow = _mm_slli_epi16(mmdst, 8); + mmlow = _mm_srai_epi16(mmlow, 8); + mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo); + mmhigh = _mm_slli_epi16(mmdst2, 8); + mmhigh = _mm_srai_epi16(mmhigh, 8); + mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi); + + mmuv = _mm_packs_epi16(mmlow, mmhigh); + + res = _mm_unpacklo_epi8(mmuv, mmy); + res = alphamask16(mmdst, res, mmsrcalo); + // convert UV to unsigned + res = _mm_xor_si128(res, uvofs); + _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res); + + res = _mm_unpackhi_epi8(mmuv, mmy); + res = alphamask16(mmdst2, res, mmsrcahi); + // convert UV to unsigned + res = _mm_xor_si128(res, uvofs); + _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res); + } + for(;x>8)+src[x]; + dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128; + } + } +#else /* HAVE_SSE2 */ for(x=0;x>8)+src[x]; + dst[1]=((dst[1]*srca[x])>>8)+src[x]; + dst[2]=((dst[2]*srca[x])>>8)+src[x]; + } + dst+=3; // 24bpp + } +#else /* HAVE_SSE2 */ for(x=0;x>8)+src[x]; + dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x]; + dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x]; + } + } +#else /* HAVE_SSE2 */ for(x=0;xbbox.x2 < obj->bbox.x1) obj->bbox.x2 = obj->bbox.x1; if (obj->bbox.y2 < obj->bbox.y1) obj->bbox.y2 = obj->bbox.y1; - obj->stride = ((obj->bbox.x2-obj->bbox.x1)+7)&(~7); + obj->stride = ((obj->bbox.x2-obj->bbox.x1)+15)&(~15); len = obj->stride*(obj->bbox.y2-obj->bbox.y1); if (obj->allocatedallocated = len;