[MPlayer-dev-eng] [PATCH] [RFC] Add intrinsic based SSE2 support to OSD.

Reimar Döffinger Reimar.Doeffinger at gmx.de
Mon May 20 01:48:19 EEST 2019


Unfinished.
In particular intrinsic support needs to be auto-detected.
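A compile test along these lines should be enough for
configure (just a sketch, not part of this patch):

#include <emmintrin.h>
int main(void) {
    __m128i v = _mm_add_epi8(_mm_set1_epi8(1), _mm_set1_epi8(2));
    return _mm_cvtsi128_si32(v);
}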

Intrinsics are not well liked, for good reason,
but they are much faster to implement, and compilers
are no longer completely horrible at them on x86.
The speedup is about 2x-3x, and the output matches the
C code instead of the less accurate MMX code.
The primary motivation, though, is that CPU support
for MMX is getting worse and worse.
To benchmark, remove the reading code from rawvideo and
the memcpy_pic from vf_expand, then use this command:
./mplayer /dev/zero -demuxer rawvideo -rawvideo w=1920:h=1080:format=yuy2 -vo null -osdlevel 3 -benchmark -quiet -frames 50000 -vf expand=osd=1 -subfont-osd-scale 20
---
 sub/osd.c          |  81 +++++++++++++----
 sub/osd_template.c | 220 +++++++++++++++++++++++++++++++++++++++++++--
 sub/sub.c          |   2 +-
 3 files changed, 278 insertions(+), 25 deletions(-)

diff --git a/sub/osd.c b/sub/osd.c
index a2d002526..f8153bdd5 100644
--- a/sub/osd.c
+++ b/sub/osd.c
@@ -31,7 +31,7 @@
 #include "libmpcodecs/img_format.h"
 #include "cpudetect.h"

-#if ARCH_X86
+#if ARCH_X86 && (!HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT)
 static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
 static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
 static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
@@ -39,32 +39,38 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF

 //Note: we have C, X86-nommx, MMX, MMX2, 3DNOW versions, there's no 3DNOW+MMX2 one
 //Plain C versions
-#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
+#if (!HAVE_MMX && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_C
 #endif

 #if ARCH_X86

-#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX
 #endif

-#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX2
 #endif

-#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_3DNOW
 #endif

+#if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_SSE2
+#endif
+
 #endif /* ARCH_X86 */

 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0

 #if ! ARCH_X86

@@ -72,9 +78,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _C
 #include "osd_template.c"
 #endif
@@ -87,9 +95,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _X86
 #include "osd_template.c"
 #endif
@@ -100,9 +110,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX
 #include "osd_template.c"
 #endif
@@ -113,9 +125,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 1
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX2
 #include "osd_template.c"
 #endif
@@ -126,20 +140,39 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 1
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _3DNow
 #include "osd_template.c"
 #endif

+//SSE2 versions
+#ifdef COMPILE_SSE2
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 1
+#define RENAME(a) a ## _SSE2
+#include <emmintrin.h>
+#include "osd_template.c"
+#endif
 #endif /* ARCH_X86 */

 void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -151,7 +184,9 @@ void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -169,7 +204,9 @@ void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, in
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -181,7 +218,9 @@ void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -199,7 +238,9 @@ void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, in
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -211,7 +252,9 @@ void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -229,7 +272,9 @@ void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, i
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -241,7 +286,9 @@ void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, i
 		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -259,7 +306,9 @@ void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, i
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -271,7 +320,9 @@ void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, i
 		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
diff --git a/sub/osd_template.c b/sub/osd_template.c
index 6d8305a3c..f361d3f5d 100644
--- a/sub/osd_template.c
+++ b/sub/osd_template.c
@@ -44,9 +44,42 @@
 #define EMMS     "emms"
 #endif

+#if HAVE_SSE2
+static inline __m128i muladd_src_unpacked(__m128i dstlo, __m128i dsthi, __m128i src, __m128i srcalo, __m128i srcahi)
+{
+    // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8)
+    __m128i mmlow = _mm_mulhi_epu16(dstlo, srcalo);
+    __m128i mmhigh = _mm_mulhi_epu16(dsthi, srcahi);
+
+    __m128i res = _mm_packus_epi16(mmlow, mmhigh);
+
+    return _mm_add_epi8(res, src);
+}
+
+static inline __m128i muladd_src(__m128i dst, __m128i src, __m128i srca)
+{
+    __m128i zero = _mm_set1_epi8(0);
+    __m128i dstlo = _mm_unpacklo_epi8(dst, zero);
+    __m128i dsthi = _mm_unpackhi_epi8(dst, zero);
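+    // srca goes into the high byte of each 16-bit word (srca * 256),
+    // so that _mm_mulhi_epu16 yields (dst * srca) >> 8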
+    __m128i srcalo = _mm_unpacklo_epi8(zero, srca);
+    __m128i srcahi = _mm_unpackhi_epi8(zero, srca);
+    return muladd_src_unpacked(dstlo, dsthi, src, srcalo, srcahi);
+}
+
+static inline __m128i alphamask(__m128i orig, __m128i blended, __m128i srca)
+{
+    __m128i zero = _mm_set1_epi8(0);
+    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
+    // thus no need to mask res
+    __m128i mask = _mm_cmpeq_epi8(srca, zero);
+    orig = _mm_and_si128(orig, mask);
+    return _mm_or_si128(blended, orig);
+}
+#endif
+
 static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
-#if defined(FAST_OSD) && !HAVE_MMX
+#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
     w=w>>1;
 #endif
 #if HAVE_MMX
@@ -94,7 +127,28 @@ static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, u
 		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
-#else
+#elif HAVE_SSE2
+        __m128i zero = _mm_set1_epi8(0);
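+        // processing a few pixels past w is safe: the OSD buffer stride
+        // is 16-byte aligned (see the sub.c change) and zero-alpha pixels
+        // just store the original dst values back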
+        for(x=0;x+8<w;x+=16){
+            __m128i mmsrc, mmdst, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+
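+            // skip blocks of 16 pixels that are fully transparent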
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) continue;
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + x));
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+
+            res = muladd_src(mmdst, mmsrc, mmsrca);
+
+            // _mm_maskmoveu_si128 would be an alternative, but slower
+            res = alphamask(mmdst, res, mmsrca);
+            _mm_storeu_si128((__m128i *)(dstbase + x), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
+        }
+#else /* HAVE_SSE2 */
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
             if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
@@ -155,7 +209,7 @@ static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, u
 		"psrlw	$8, %%mm0\n\t"
 		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
 		"movd %2, %%mm2\n\t"		//src 0000DCBA
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
+		"punpcklbw %%mm7, %%mm2\n\t"	//src 0D0C0B0A
 		"por %%mm1, %%mm0\n\t"
 		"paddb	%%mm2, %%mm0\n\t"
 		"movq	%%mm0, %0\n\t"
@@ -163,7 +217,56 @@ static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, u
 		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
-#else
+#elif HAVE_SSE2
+        __m128i zero = _mm_set1_epi8(0);
+        __m128i ymask = _mm_set1_epi16(0xff);
+        __m128i uvofs = _mm_set1_epi16(0x8000);
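+        // 16 pixels per iteration: 16 bytes of src/srca and 32 bytes of
+        // interleaved Y U Y V in dst (mmdst = pixels 0-7, mmdst2 = 8-15)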
+        for(x=0;x+12<w;x+=16){
+            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) continue;
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
+            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));
+
+            // convert UV to signed
+            mmdst = _mm_xor_si128(mmdst, uvofs);
+            mmdst2 = _mm_xor_si128(mmdst2, uvofs);
+
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
+            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);
+
+            mmy = muladd_src_unpacked(_mm_and_si128(mmdst, ymask), _mm_and_si128(mmdst2, ymask), mmsrc, mmsrcalo, mmsrcahi);
+
+            // mmuv = ((dst(uv) ^ 0x80) * (srca * 256)) / 65536
+            //      = ((dst(uv) - 128) * srca) >> 8; the XOR below restores the +128 bias
+            mmlow = _mm_srai_epi16(mmdst, 8);
+            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
+            mmhigh = _mm_srai_epi16(mmdst2, 8);
+            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);
+
+            mmuv = _mm_packs_epi16(mmlow, mmhigh);
+
+            res = _mm_unpacklo_epi8(mmy, mmuv);
+            // the mask needs the alpha duplicated over both the Y and
+            // the U/V byte of each pixel
+            res = alphamask(mmdst, res, _mm_unpacklo_epi8(mmsrca, mmsrca));
+            // convert UV back to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);
+
+            res = _mm_unpackhi_epi8(mmy, mmuv);
+            res = alphamask(mmdst2, res, _mm_unpackhi_epi8(mmsrca, mmsrca));
+            // convert UV back to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) {
+               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
+               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
+            }
+        }
+#else /* HAVE_SSE2 */
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
             if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
@@ -210,6 +313,15 @@ static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, u
   }
 }

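+// REPL3X expands the 6 low bytes of "in" so that each byte is repeated
+// 3 times (RGB24 needs 3 identical bytes per pixel, src/srca have one).
+// sd1/sa1 and sd2/sa2 select the two extra byte shifts (l = slli,
+// r = srli) that fill the remaining two of every three byte positions.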
+#define REPL3X(out, sd1, sa1, sd2, sa2, in) \
+do { \
+   __m128i shuf012 = _mm_shufflelo_epi16(in, 0x40); \
+   __m128i shuf345 = _mm_shufflelo_epi16(in, 0xA5); \
+   __m128i repl3x_mmtmp = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(shuf012), _mm_castsi128_ps(shuf345))); \
+   repl3x_mmtmp = _mm_and_si128(repl3x_mmtmp, one_in_three_mask); \
+   out = _mm_or_si128(_mm_or_si128(repl3x_mmtmp, _mm_s##sd1##li_si128(repl3x_mmtmp, sa1)), _mm_s##sd2##li_si128(repl3x_mmtmp, sa2)); \
+} while (0)
+
 static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
 #if HAVE_MMX
@@ -221,7 +333,7 @@ static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src,
     for(y=0;y<h;y++){
         register unsigned char *dst = dstbase;
         register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
 #if HAVE_MMX
     __asm__ volatile(
 	PREFETCHW" %0\n\t"
@@ -264,7 +376,62 @@ static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src,
 		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
 		dst += 6;
 	}
-#else /* HAVE_MMX */
+#elif HAVE_SSE2
+        __m128i one_in_three_mask = _mm_set_epi32(0xff0000ffu, 0x0000ff00u, 0x00ff0000u, 0xff0000ffu);
+        __m128i zero = _mm_set1_epi8(0);
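+        // 16 pixels = 48 dst bytes, written as three 16-byte chunks;
+        // alpha bits 0-5, 5-10 and 10-15 gate the three stores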
+        for(x=0;x+14<w;x+=16){
+            __m128i mmsrc, mmtmp, mmtmpa, mmdst, res;
+            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+            if (alpha == 0xffff) { dst += 48; continue; }
+
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+
+            if ((alpha & 0x3f) != 0x3f) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, l, 1, l, 2, mmsrca);
+                REPL3X(mmtmp, l, 1, l, 2, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+
+            mmsrca = _mm_srli_si128(mmsrca, 5);
+            mmsrc = _mm_srli_si128(mmsrc, 5);
+
+            if ((alpha & 0x7e0) != 0x7e0) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, l, 1, r, 1, mmsrca);
+                REPL3X(mmtmp, l, 1, r, 1, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+
+            mmsrc = _mm_srli_si128(mmsrc, 5);
+            mmsrca = _mm_srli_si128(mmsrca, 5);
+
+            if ((alpha & 0xfc00) != 0xfc00) {
+                mmdst = _mm_loadu_si128((const __m128i *)dst);
+                REPL3X(mmtmpa, r, 1, r, 2, mmsrca);
+                REPL3X(mmtmp, r, 1, r, 2, mmsrc);
+                res = muladd_src(mmdst, mmtmp, mmtmpa);
+                res = alphamask(mmdst, res, mmtmpa);
+                _mm_storeu_si128((__m128i *)dst, res);
+            }
+            dst += 16;
+        }
+        for(;x<w;x++){
+            if(srca[x]){
+		dst[0]=((dst[0]*srca[x])>>8)+src[x];
+		dst[1]=((dst[1]*srca[x])>>8)+src[x];
+		dst[2]=((dst[2]*srca[x])>>8)+src[x];
+            }
+            dst+=3; // 24bpp
+        }
+#else /* HAVE_SSE2 */
     for(x=0;x<w;x++){
         if(srca[x]){
 	    __asm__ volatile(
@@ -294,7 +461,7 @@ static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src,
 	    dst += 3;
         }
 #endif /* !HAVE_MMX */
-#else /*non x86 arch or x86_64 with MMX disabled */
+#else /*non x86 arch or x86_64 with MMX and SSE2 disabled */
         for(x=0;x<w;x++){
             if(srca[x]){
 #ifdef FAST_OSD
@@ -341,7 +508,7 @@ static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src,
 #endif /* HAVE_MMX */
     for(y=0;y<h;y++){
         register int x;
-#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
+#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
 #if HAVE_MMX
 #if HAVE_AMD3DNOW
     __asm__ volatile(
@@ -431,7 +598,42 @@ static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src,
 		: "%eax");
 	}
 #endif
-#else /* HAVE_MMX */
+#elif HAVE_SSE2
+        __m128i zero = _mm_set1_epi8(0);
+        // mmsrca and mmsrc must survive across iterations: they are
+        // loaded once per 16 pixels and shifted down 4 bytes per pass
+        __m128i mmsrca = zero;
+        __m128i mmsrc = zero;
+        for(x=0;x<w;x+=4){
+            __m128i mmdst, mmsrcexp, mmsrcaexp, res;
+            if ((x & 15) == 0) {
+                int alpha;
+                mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+                alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
+                if (alpha == 0xffff) { x += 12; continue; }
+                mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            }
+
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 4*x));
+
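+            // duplicate each src/srca byte 4x (one copy per RGBA byte)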
+            mmsrcaexp = _mm_unpacklo_epi8(mmsrca, mmsrca);
+            mmsrcaexp = _mm_unpacklo_epi8(mmsrcaexp, mmsrcaexp);
+            mmsrcexp = _mm_unpacklo_epi8(mmsrc, mmsrc);
+            mmsrcexp = _mm_unpacklo_epi8(mmsrcexp, mmsrcexp);
+
+            res = muladd_src(mmdst, mmsrcexp, mmsrcaexp);
+
+            res = alphamask(mmdst, res, mmsrcaexp);
+            _mm_storeu_si128((__m128i *)(dstbase + 4*x), res);
+
+            mmsrca = _mm_srli_si128(mmsrca, 4);
+            mmsrc = _mm_srli_si128(mmsrc, 4);
+        }
+        for(;x<w;x++){
+            if(srca[x]){
+		dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
+		dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
+		dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
+	    }
+        }
+#else /* HAVE_SSE2 */
     for(x=0;x<w;x++){
         if(srca[x]){
 	    __asm__ volatile(
diff --git a/sub/sub.c b/sub/sub.c
index 86f7fffad..0d285e134 100644
--- a/sub/sub.c
+++ b/sub/sub.c
@@ -152,7 +152,7 @@ static void alloc_buf(mp_osd_obj_t* obj)
     int len;
     if (obj->bbox.x2 < obj->bbox.x1) obj->bbox.x2 = obj->bbox.x1;
     if (obj->bbox.y2 < obj->bbox.y1) obj->bbox.y2 = obj->bbox.y1;
-    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+7)&(~7);
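+    // the SSE2 OSD code processes 16 bytes at a time, so round the
+    // stride up to a multiple of 16 instead of 8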
+    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+15)&(~15);
     len = obj->stride*(obj->bbox.y2-obj->bbox.y1);
     if (obj->allocated<len) {
 	obj->allocated = len;
--
2.20.1


