[MPlayer-cvslog] r38143 - in trunk: configure sub/osd.c sub/osd_template.c sub/sub.c

reimar subversion at mplayerhq.hu
Mon May 27 23:02:12 EEST 2019


Author: reimar
Date: Mon May 27 23:02:12 2019
New Revision: 38143

Log:
osd_template.c: Add intrinsic-based SSE2 support.

Intrinsics are not well liked, for good reason, but they are much
faster to implement, and compilers are no longer completely horrible
at them for x86.
Speed is about 2x-3x that of the existing code, and the output matches
the C code instead of the less accurate MMX code.
The primary motivation, though, is that CPU support for MMX is getting
worse and worse.
To benchmark, remove the reading code from the rawvideo demuxer and
the memcpy_pic from vf_expand, then run:
./mplayer /dev/zero -demuxer rawvideo -rawvideo w=1920:h=1080:format=yuy2 -vo null -osdlevel 3 -benchmark -quiet -frames 50000 -vf expand=osd=1 -subfont-osd-scale 20
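
All of the SIMD paths below implement the same per-byte blend as the
plain C fallback loops in osd_template.c. As a scalar reference sketch
(draw_alpha_ref is an illustrative name, not part of this commit):

    /* Reference OSD alpha blend: dst = ((dst * srca) >> 8) + src,
     * skipping pixels with zero alpha (srca == 0 implies src == 0). */
    static void draw_alpha_ref(int w, unsigned char *src,
                               unsigned char *srca, unsigned char *dst)
    {
        int x;
        for (x = 0; x < w; x++)
            if (srca[x])
                dst[x] = ((dst[x] * srca[x]) >> 8) + src[x];
    }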

Modified:
   trunk/configure
   trunk/sub/osd.c
   trunk/sub/osd_template.c
   trunk/sub/sub.c

Modified: trunk/configure
==============================================================================
--- trunk/configure	Sun May 26 21:56:32 2019	(r38142)
+++ trunk/configure	Mon May 27 23:02:12 2019	(r38143)
@@ -3883,6 +3883,26 @@ header_check sys/poll.h && poll_h=yes &&
   def_poll_h='#define HAVE_POLL_H 1'
 echores "$poll_h"
 
+echocheck "emmintrin.h (SSE intrinsics)"
+emmintrin_h=no
+def_emmintrin_h='#define HAVE_EMMINTRIN_H 0'
+  cat > $TMPC << EOF
+#include <emmintrin.h>
+
+__attribute__((target("sse2")))
+static int sse2test(int i) {
+    __m128i mmi = _mm_set1_epi16(i);
+    mmi = _mm_add_epi16(mmi, mmi);
+    return _mm_extract_epi16(mmi, 2);
+}
+
+int main(int argc, char **argv) {
+    return sse2test(argc);
+}
+EOF
+cc_check && emmintrin_h=yes &&
+  def_emmintrin_h='#define HAVE_EMMINTRIN_H 1'
+echores "$emmintrin_h"
 
 echocheck "inttypes.h (required)"
 _inttypes=no
@@ -9243,6 +9263,12 @@ $def_direct_h
 $def_io_h
 $def_poll_h
 $def_windows_h
+$def_emmintrin_h
+#if ARCH_X86_32
+#define ATTR_TARGET_SSE2 __attribute__((target("sse2")))
+#else
+#define ATTR_TARGET_SSE2
+#endif
 
 /* external libraries */
 $def_bzlib

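The ATTR_TARGET_SSE2 macro defined above matters on 32-bit x86, where
SSE2 is not part of the compiler baseline: every function using the
intrinsics must carry __attribute__((target("sse2"))). On x86-64 SSE2
is always available, so the macro expands to nothing. A minimal sketch
of the intended usage (fill16 is an illustrative name, not part of
this commit):

    #include <emmintrin.h>
    /* ATTR_TARGET_SSE2 comes from config.h as generated above */

    /* On x86-32 the target attribute lets this one function use SSE2
     * even when the rest of the file is compiled for a plain baseline. */
    ATTR_TARGET_SSE2
    static void fill16(unsigned char *dst, unsigned char v)
    {
        _mm_storeu_si128((__m128i *)dst, _mm_set1_epi8(v));
    }
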
Modified: trunk/sub/osd.c
==============================================================================
--- trunk/sub/osd.c	Sun May 26 21:56:32 2019	(r38142)
+++ trunk/sub/osd.c	Mon May 27 23:02:12 2019	(r38143)
@@ -31,7 +31,12 @@
 #include "libmpcodecs/img_format.h"
 #include "cpudetect.h"
 
-#if ARCH_X86
+#if !HAVE_EMMINTRIN_H
+#undef HAVE_SSE2
+#define HAVE_SSE2 0
+#endif
+
+#if ARCH_X86 && (!HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT)
 static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
 static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
 static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
@@ -39,32 +44,38 @@ static const unsigned long long mask24hl
 
 //Note: we have C, X86-nommx, MMX, MMX2, 3DNOW versions; there's no 3DNOW+MMX2 one
 //Plain C versions
-#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
+#if (!HAVE_MMX && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_C
 #endif
 
 #if ARCH_X86
 
-#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX
 #endif
 
-#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX2
 #endif
 
-#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_3DNOW
 #endif
 
+#if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_SSE2
+#endif
+
 #endif /* ARCH_X86 */
 
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 
 #if ! ARCH_X86
 
@@ -72,9 +83,11 @@ static const unsigned long long mask24hl
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _C
 #include "osd_template.c"
 #endif
@@ -87,9 +100,11 @@ static const unsigned long long mask24hl
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _X86
 #include "osd_template.c"
 #endif
@@ -100,9 +115,11 @@ static const unsigned long long mask24hl
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX
 #include "osd_template.c"
 #endif
@@ -113,9 +130,11 @@ static const unsigned long long mask24hl
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 1
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX2
 #include "osd_template.c"
 #endif
@@ -126,20 +145,39 @@ static const unsigned long long mask24hl
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 1
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _3DNow
 #include "osd_template.c"
 #endif
 
+//SSE2 versions
+#ifdef COMPILE_SSE2
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 1
+#define RENAME(a) a ## _SSE2
+#include <emmintrin.h>
+#include "osd_template.c"
+#endif
 #endif /* ARCH_X86 */
 
 void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -151,7 +189,9 @@ void vo_draw_alpha_yv12(int w,int h, uns
 		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -169,7 +209,9 @@ void vo_draw_alpha_yuy2(int w,int h, uns
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -181,7 +223,9 @@ void vo_draw_alpha_yuy2(int w,int h, uns
 		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -199,7 +243,9 @@ void vo_draw_alpha_uyvy(int w,int h, uns
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -211,7 +257,9 @@ void vo_draw_alpha_uyvy(int w,int h, uns
 		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -229,7 +277,9 @@ void vo_draw_alpha_rgb24(int w,int h, un
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -241,7 +291,9 @@ void vo_draw_alpha_rgb24(int w,int h, un
 		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -259,7 +311,9 @@ void vo_draw_alpha_rgb32(int w,int h, un
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -271,7 +325,9 @@ void vo_draw_alpha_rgb32(int w,int h, un
 		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -306,7 +362,9 @@ void vo_draw_alpha_init(void){
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
  		// ordered by speed / fastest first
-		if(gCpuCaps.hasMMX2)
+		if(HAVE_EMMINTRIN_H && gCpuCaps.hasSSE2)
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using SSE2 Optimized OnScreenDisplay\n");
+		else if(gCpuCaps.hasMMX2)
 			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
 		else if(gCpuCaps.has3DNow)
 			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
@@ -318,7 +376,9 @@ void vo_draw_alpha_init(void){
 			mp_msg(MSGT_OSD,MSGL_INFO,"Using Unoptimized OnScreenDisplay\n");
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+			mp_msg(MSGT_OSD,MSGL_INFO,"Using SSE2 Optimized OnScreenDisplay\n");
+#elif HAVE_MMX2
 			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
 #elif HAVE_AMD3DNOW
 			mp_msg(MSGT_OSD,MSGL_INFO,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");

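The osd.c hunks extend the file's existing multiple-inclusion template
pattern: osd_template.c is compiled once per instruction set, with the
HAVE_* macros forced to a fixed combination and RENAME() pasting a
suffix onto every function name; with CONFIG_RUNTIME_CPUDETECT the
dispatcher then picks the _SSE2 variant when gCpuCaps.hasSSE2 is set.
A stripped-down sketch of the mechanism (template.h and blend_row are
illustrative names, not the real ones):

    /* template.h -- included once per variant by the host file */
    static void RENAME(blend_row)(unsigned char *dst, int w);

    /* host file */
    #undef  HAVE_SSE2
    #define HAVE_SSE2 0
    #define RENAME(a) a ## _C
    #include "template.h"       /* declares blend_row_C */

    #undef  RENAME
    #undef  HAVE_SSE2
    #define HAVE_SSE2 1
    #define RENAME(a) a ## _SSE2
    #include "template.h"       /* declares blend_row_SSE2 */
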
Modified: trunk/sub/osd_template.c
==============================================================================
--- trunk/sub/osd_template.c	Sun May 26 21:56:32 2019	(r38142)
+++ trunk/sub/osd_template.c	Mon May 27 23:02:12 2019	(r38143)
@@ -44,9 +44,60 @@
 #define EMMS     "emms"
 #endif
 
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+static inline __m128i muladd_src_unpacked(__m128i dstlo, __m128i dsthi, __m128i src, __m128i srcalo, __m128i srcahi)
+{
+    // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8)
+    __m128i mmlow = _mm_mulhi_epu16(dstlo, srcalo);
+    __m128i mmhigh = _mm_mulhi_epu16(dsthi, srcahi);
+
+    __m128i res = _mm_packus_epi16(mmlow, mmhigh);
+
+    return _mm_add_epi8(res, src);
+}
+
+ATTR_TARGET_SSE2
+static inline __m128i muladd_src(__m128i dst, __m128i src, __m128i srca)
+{
+    __m128i zero = _mm_setzero_si128();
+    __m128i dstlo = _mm_unpacklo_epi8(dst, zero);
+    __m128i dsthi = _mm_unpackhi_epi8(dst, zero);
+    __m128i srcalo = _mm_unpacklo_epi8(zero, srca);
+    __m128i srcahi = _mm_unpackhi_epi8(zero, srca);
+    return muladd_src_unpacked(dstlo, dsthi, src, srcalo, srcahi);
+}
+
+ATTR_TARGET_SSE2
+static inline __m128i alphamask(__m128i orig, __m128i blended, __m128i srca)
+{
+    __m128i zero = _mm_setzero_si128();
+    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
+    // thus no need to mask res
+    __m128i mask = _mm_cmpeq_epi8(srca, zero);
+    orig = _mm_and_si128(orig, mask);
+    return _mm_or_si128(blended, orig);
+}
+
+// Special version that compares alpha in 16 bit chunks instead of bytewise
+ATTR_TARGET_SSE2
+static inline __m128i alphamask16(__m128i orig, __m128i blended, __m128i srca)
+{
+    __m128i zero = _mm_setzero_si128();
+    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
+    // thus no need to mask res
+    __m128i mask = _mm_cmpeq_epi16(srca, zero);
+    orig = _mm_and_si128(orig, mask);
+    return _mm_or_si128(blended, orig);
+}
+#endif
+
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+#endif
 static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
-#if defined(FAST_OSD) && !HAVE_MMX
+#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
     w=w>>1;
 #endif
 #if HAVE_MMX
@@ -94,7 +145,28 @@ static inline void RENAME(vo_draw_alpha_
 		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
-#else
+#elif HAVE_SSE2
+        __m128i zero = _mm_setzero_si128();
+        for(x=0;x+8<w;x+=16){
+            __m128i mmsrc, mmdst, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) continue;
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + x));
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+
+            res = muladd_src(mmdst, mmsrc, mmsrca);
+
+            // _mm_maskmoveu_si128 would be an alternative but slower
+            res = alphamask(mmdst, res, mmsrca);
+            _mm_storeu_si128((__m128i *)(dstbase + x), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
+        }
+#else /* HAVE_SSE2 */
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
             if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
@@ -114,6 +186,9 @@ static inline void RENAME(vo_draw_alpha_
     return;
 }
 
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+#endif
 static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if defined(FAST_OSD) && !HAVE_MMX
@@ -155,7 +230,7 @@ static inline void RENAME(vo_draw_alpha_
 		"psrlw	$8, %%mm0\n\t"
 		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
 		"movd %2, %%mm2\n\t"		//src 0000DCBA
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
+		"punpcklbw %%mm7, %%mm2\n\t"	//src 0D0C0B0A
 		"por %%mm1, %%mm0\n\t"
 		"paddb	%%mm2, %%mm0\n\t"
 		"movq	%%mm0, %0\n\t"
@@ -163,7 +238,56 @@ static inline void RENAME(vo_draw_alpha_
 		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
-#else
+#elif HAVE_SSE2
+        __m128i zero = _mm_setzero_si128();
+        __m128i ymask = _mm_set1_epi16(0xff);
+        __m128i uvofs = _mm_set1_epi16(0x8000);
+        for(x=0;x+12<w;x+=16){
+            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) continue;
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
+            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));
+
+            // convert UV to signed
+            mmdst = _mm_xor_si128(mmdst, uvofs);
+            mmdst2 = _mm_xor_si128(mmdst2, uvofs);
+
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
+            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);
+
+            mmy = muladd_src_unpacked(_mm_and_si128(mmdst, ymask), _mm_and_si128(mmdst2, ymask), mmsrc, mmsrcalo, mmsrcahi);
+
+            // mmuv = (((dst(uv) ^ 128) * (srca * 256)) / 65536) ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
+            mmlow = _mm_srai_epi16(mmdst, 8);
+            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
+            mmhigh = _mm_srai_epi16(mmdst2, 8);
+            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);
+
+            mmuv = _mm_packs_epi16(mmlow, mmhigh);
+
+            res = _mm_unpacklo_epi8(mmy, mmuv);
+            res = alphamask16(mmdst, res, mmsrcalo);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);
+
+            res = _mm_unpackhi_epi8(mmy, mmuv);
+            res = alphamask16(mmdst2, res, mmsrcahi);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) {
+               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
+               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
+            }
+        }
+#else /* HAVE_SSE2 */
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
             if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
@@ -186,6 +310,9 @@ static inline void RENAME(vo_draw_alpha_
     return;
 }
 
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+#endif
 static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
   int y;
 #if defined(FAST_OSD)
@@ -193,6 +320,58 @@ static inline void RENAME(vo_draw_alpha_
 #endif
   for(y=0;y<h;y++){
     register int x;
+#if HAVE_SSE2
+        __m128i zero = _mm_setzero_si128();
+        __m128i uvofs = _mm_set1_epi16(0x80);
+        for(x=0;x+12<w;x+=16){
+            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) continue;
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
+            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));
+
+            // convert UV to signed
+            mmdst = _mm_xor_si128(mmdst, uvofs);
+            mmdst2 = _mm_xor_si128(mmdst2, uvofs);
+
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
+            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);
+
+            mmy = muladd_src_unpacked(_mm_srli_epi16(mmdst, 8), _mm_srli_epi16(mmdst2, 8), mmsrc, mmsrcalo, mmsrcahi);
+
+            // mmuv = (((dst(uv) ^ 128) * (srca * 256)) / 65536) ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
+            // sign-extend and multiply
+            mmlow = _mm_slli_epi16(mmdst, 8);
+            mmlow = _mm_srai_epi16(mmlow, 8);
+            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
+            mmhigh = _mm_slli_epi16(mmdst2, 8);
+            mmhigh = _mm_srai_epi16(mmhigh, 8);
+            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);
+
+            mmuv = _mm_packs_epi16(mmlow, mmhigh);
+
+            res = _mm_unpacklo_epi8(mmuv, mmy);
+            res = alphamask16(mmdst, res, mmsrcalo);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);
+
+            res = _mm_unpackhi_epi8(mmuv, mmy);
+            res = alphamask16(mmdst2, res, mmsrcahi);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) {
+	       dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
+               dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
+            }
+        }
+#else /* HAVE_SSE2 */
     for(x=0;x<w;x++){
 #ifdef FAST_OSD
       if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
@@ -204,12 +383,25 @@ static inline void RENAME(vo_draw_alpha_
       }
 #endif
     }
+#endif
     src+=srcstride;
     srca+=srcstride;
     dstbase+=dststride;
   }
 }
 
+#define REPL3X(out, sd1, sa1, sd2, sa2, in) \
+do { \
+   __m128i shuf012 = _mm_shufflelo_epi16(in, 0x40); \
+   __m128i shuf345 = _mm_shufflelo_epi16(in, 0xA5); \
+   __m128i repl3x_mmtmp = _mm_unpacklo_epi64(shuf012, shuf345); \
+   repl3x_mmtmp = _mm_and_si128(repl3x_mmtmp, one_in_three_mask); \
+   out = _mm_or_si128(_mm_or_si128(repl3x_mmtmp, _mm_s##sd1##li_si128(repl3x_mmtmp, sa1)), _mm_s##sd2##li_si128(repl3x_mmtmp, sa2)); \
+} while (0)
+
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+#endif
 static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if HAVE_MMX
@@ -221,7 +413,7 @@ static inline void RENAME(vo_draw_alpha_
     for(y=0;y<h;y++){
         register unsigned char *dst = dstbase;
         register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
 #if HAVE_MMX
     __asm__ volatile(
 	PREFETCHW" %0\n\t"
@@ -264,7 +456,62 @@ static inline void RENAME(vo_draw_alpha_
 		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
 		dst += 6;
 	}
-#else /* HAVE_MMX */
+#elif HAVE_SSE2
+        __m128i one_in_three_mask = _mm_set_epi32(0xff0000ffu, 0x0000ff00u, 0x00ff0000u, 0xff0000ffu);
+        __m128i zero = _mm_setzero_si128();
+        for(x=0;x+14<w;x+=16){
+            __m128i mmsrc, mmtmp, mmtmpa, mmdst, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) { dst += 48; continue; }
+
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+
+            if ((alpha & 0x3f) != 0x3f) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, l, 1, l, 2, mmsrca);
+                REPL3X(mmtmp, l, 1, l, 2, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+
+            mmsrca = _mm_srli_si128(mmsrca, 5);
+            mmsrc = _mm_srli_si128(mmsrc, 5);
+
+            if ((alpha & 0x7e0) != 0x7e0) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, l, 1, r, 1, mmsrca);
+                REPL3X(mmtmp, l, 1, r, 1, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+
+            mmsrc = _mm_srli_si128(mmsrc, 5);
+            mmsrca = _mm_srli_si128(mmsrca, 5);
+
+            if ((alpha & 0xfc00) != 0xfc00) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, r, 1, r, 2, mmsrca);
+                REPL3X(mmtmp, r, 1, r, 2, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+        }
+        for(;x<w;x++){
+            if(srca[x]){
+		dst[0]=((dst[0]*srca[x])>>8)+src[x];
+		dst[1]=((dst[1]*srca[x])>>8)+src[x];
+		dst[2]=((dst[2]*srca[x])>>8)+src[x];
+            }
+            dst+=3; // 24bpp
+        }
+#else /* HAVE_SSE2 */
     for(x=0;x<w;x++){
         if(srca[x]){
 	    __asm__ volatile(
@@ -294,7 +541,7 @@ static inline void RENAME(vo_draw_alpha_
 	    dst += 3;
         }
 #endif /* !HAVE_MMX */
-#else /*non x86 arch or x86_64 with MMX disabled */
+#else /*non x86 arch or x86_64 with MMX and SSE2 disabled */
         for(x=0;x<w;x++){
             if(srca[x]){
 #ifdef FAST_OSD
@@ -318,6 +565,9 @@ static inline void RENAME(vo_draw_alpha_
     return;
 }
 
+#if HAVE_SSE2
+ATTR_TARGET_SSE2
+#endif
 static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if HAVE_BIGENDIAN
@@ -341,7 +591,7 @@ static inline void RENAME(vo_draw_alpha_
 #endif /* HAVE_MMX */
     for(y=0;y<h;y++){
         register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
 #if HAVE_MMX
 #if HAVE_AMD3DNOW
     __asm__ volatile(
@@ -431,7 +681,42 @@ static inline void RENAME(vo_draw_alpha_
 		: "%eax");
 	}
 #endif
-#else /* HAVE_MMX */
+#elif HAVE_SSE2
+        __m128i zero = _mm_setzero_si128();
+        __m128i mmsrca = zero, mmsrc = zero; // must stay live across iterations: (re)loaded only every 16 pixels
+        for(x=0;x<w;x+=4){
+            __m128i mmdst, mmsrcexp, mmsrcaexp, res;
+            if ((x & 15) == 0) {
+                int alpha;
+                mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+                alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+                if (alpha == 0xffff) { x += 12; continue; }
+                mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            }
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 4*x));
+
+            mmsrcaexp = _mm_unpacklo_epi8(mmsrca, mmsrca);
+            mmsrcaexp = _mm_unpacklo_epi8(mmsrcaexp, mmsrcaexp);
+            mmsrcexp = _mm_unpacklo_epi8(mmsrc, mmsrc);
+            mmsrcexp = _mm_unpacklo_epi8(mmsrcexp, mmsrcexp);
+
+            res = muladd_src(mmdst, mmsrcexp, mmsrcaexp);
+
+            res = alphamask(mmdst, res, mmsrcaexp);
+            _mm_storeu_si128((__m128i *)(dstbase + 4*x), res);
+
+            mmsrca = _mm_srli_si128(mmsrca, 4);
+            mmsrc = _mm_srli_si128(mmsrc, 4);
+        }
+        for(;x<w;x++){
+            if(srca[x]){
+		dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
+		dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
+		dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
+	    }
+        }
+#else /* HAVE_SSE2 */
     for(x=0;x<w;x++){
         if(srca[x]){
 	    __asm__ volatile(

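The arithmetic shared by these SSE2 variants is the high-multiply trick
noted in muladd_src_unpacked above: alpha is unpacked into the high
byte of each 16-bit lane (_mm_unpacklo_epi8(zero, srca), i.e.
srca * 256), so _mm_mulhi_epu16 yields (dst * srca * 256) / 65536 ==
(dst * srca) >> 8 per lane. The same computation as a scalar sketch:

    #include <stdint.h>

    /* One 16-bit lane of the SSE2 blend: alpha scaled into the high
     * byte, then the upper half of the 16x16 product taken. */
    static uint8_t blend_lane(uint8_t dst, uint8_t srca)
    {
        uint16_t a = (uint16_t)srca << 8;            /* srca * 256 */
        return (uint8_t)(((uint32_t)dst * a) >> 16); /* (dst*srca)>>8 */
    }
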
Modified: trunk/sub/sub.c
==============================================================================
--- trunk/sub/sub.c	Sun May 26 21:56:32 2019	(r38142)
+++ trunk/sub/sub.c	Mon May 27 23:02:12 2019	(r38143)
@@ -152,7 +152,7 @@ static void alloc_buf(mp_osd_obj_t* obj)
     int len;
     if (obj->bbox.x2 < obj->bbox.x1) obj->bbox.x2 = obj->bbox.x1;
     if (obj->bbox.y2 < obj->bbox.y1) obj->bbox.y2 = obj->bbox.y1;
-    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+7)&(~7);
+    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+15)&(~15);
     len = obj->stride*(obj->bbox.y2-obj->bbox.y1);
     if (obj->allocated<len) {
 	obj->allocated = len;

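The sub.c change widens the OSD buffer stride from a multiple of 8 to a
multiple of 16 so that each row of the src/srca buffers stays 16-byte
aligned for the _mm_load_si128 reads above (assuming the allocation
itself is 16-byte aligned), and so the vector loops can safely touch a
full 16-byte tail within the stride. The rounding idiom, for reference:

    /* Round w up to the next multiple of 16: 1 -> 16, 16 -> 16, 17 -> 32. */
    static int round_up16(int w)
    {
        return (w + 15) & ~15;
    }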
