[MPlayer-dev-eng] [PATCH] [RFC] Add intrinsic based SSE2 support to OSD.

Reimar Döffinger Reimar.Doeffinger at gmx.de
Sat May 18 17:39:12 EEST 2019


Unfinished.
In particular, several functions are still missing (only the yv12 and
yuy2 variants have SSE2 implementations so far) and
intrinsic support needs to be auto-detected.

Intrinsics are not well liked, for good reason,
but they are much faster to implement with, and compilers
are no longer completely horrible at handling them on x86.
Speed is about 2x, and the output matches the C code instead
of the less accurate MMX code.
The primary motivation, though, is that MMX is starting to
have worse and worse support in CPUs.
To benchmark, first remove the reading code from rawvideo and
the memcpy_pic from vf_expand, then use this command:
./mplayer /dev/zero -demuxer rawvideo -rawvideo w=1920:h=1080:format=yuy2 -vo null -osdlevel 3 -benchmark -quiet -frames 50000 -vf expand=osd=1 -subfont-osd-scale 20
---
 sub/osd.c          | 83 +++++++++++++++++++++++++++++++--------
 sub/osd_template.c | 96 +++++++++++++++++++++++++++++++++++++++++++++-
 sub/sub.c          |  2 +-
 3 files changed, 162 insertions(+), 19 deletions(-)

diff --git a/sub/osd.c b/sub/osd.c
index a2d002526..14906eca9 100644
--- a/sub/osd.c
+++ b/sub/osd.c
@@ -31,40 +31,46 @@
 #include "libmpcodecs/img_format.h"
 #include "cpudetect.h"

-#if ARCH_X86
+//#if ARCH_X86 && (!HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT)
 static const uint64_t bFF __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
 static const unsigned long long mask24lh  __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
 static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
-#endif
+//#endif

 //Note: we have C, X86-nommx, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
 //Plain C versions
-#if !HAVE_MMX || CONFIG_RUNTIME_CPUDETECT
+#if (!HAVE_MMX && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_C
 #endif

 #if ARCH_X86

-#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX
 #endif

-#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_MMX2
 #endif

-#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
+#if (HAVE_AMD3DNOW && !HAVE_MMX2 && !HAVE_SSE2) || CONFIG_RUNTIME_CPUDETECT
 #define COMPILE_3DNOW
 #endif

+#if HAVE_SSE2 || CONFIG_RUNTIME_CPUDETECT
+#define COMPILE_SSE2
+#endif
+
 #endif /* ARCH_X86 */

 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0

 #if ! ARCH_X86

@@ -72,9 +78,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _C
 #include "osd_template.c"
 #endif
@@ -87,9 +95,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 0
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _X86
 #include "osd_template.c"
 #endif
@@ -100,9 +110,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX
 #include "osd_template.c"
 #endif
@@ -113,9 +125,11 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 1
 #define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _MMX2
 #include "osd_template.c"
 #endif
@@ -126,20 +140,39 @@ static const unsigned long long mask24hl  __attribute__((aligned(8))) = 0x0000FF
 #undef HAVE_MMX
 #undef HAVE_MMX2
 #undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
 #define HAVE_MMX 1
 #define HAVE_MMX2 0
 #define HAVE_AMD3DNOW 1
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _3DNow
 #include "osd_template.c"
 #endif

+//SSE2 versions
+#ifdef COMPILE_SSE2
+#undef RENAME
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#undef HAVE_SSE2
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 1
+#define RENAME(a) a ## _SSE2
+#include <emmintrin.h>
+#include "osd_template.c"
+#endif
 #endif /* ARCH_X86 */

 void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -151,7 +184,9 @@ void vo_draw_alpha_yv12(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yv12_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -169,7 +204,9 @@ void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, in
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -181,7 +218,9 @@ void vo_draw_alpha_yuy2(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_yuy2_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -199,7 +238,9 @@ void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, in
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -211,7 +252,9 @@ void vo_draw_alpha_uyvy(int w,int h, unsigned char* src, unsigned char *srca, in
 		vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_uyvy_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -229,7 +272,9 @@ void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, i
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -241,7 +286,9 @@ void vo_draw_alpha_rgb24(int w,int h, unsigned char* src, unsigned char *srca, i
 		vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb24_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -259,7 +306,9 @@ void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, i
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86
 	// ordered by speed / fastest first
-	if(gCpuCaps.hasMMX2)
+	if(gCpuCaps.hasSSE2)
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+	else if(gCpuCaps.hasMMX2)
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 	else if(gCpuCaps.has3DNow)
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
@@ -271,7 +320,9 @@ void vo_draw_alpha_rgb32(int w,int h, unsigned char* src, unsigned char *srca, i
 		vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
 #endif
 #else //CONFIG_RUNTIME_CPUDETECT
-#if HAVE_MMX2
+#if HAVE_SSE2
+		vo_draw_alpha_rgb32_SSE2(w, h, src, srca, srcstride, dstbase, dststride);
+#elif HAVE_MMX2
 		vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
 #elif HAVE_AMD3DNOW
 		vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
diff --git a/sub/osd_template.c b/sub/osd_template.c
index 6d8305a3c..9ef2ada46 100644
--- a/sub/osd_template.c
+++ b/sub/osd_template.c
@@ -24,6 +24,9 @@
 #undef PREFETCHW
 #undef PAVGB

+//#define HAVE_MMX 1
+//#define HAVE_MMX2 1
+
 #if HAVE_AMD3DNOW
 #define PREFETCH  "prefetch"
 #define PREFETCHW "prefetchw"
@@ -46,7 +49,7 @@

 static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
     int y;
-#if defined(FAST_OSD) && !HAVE_MMX
+#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
     w=w>>1;
 #endif
 #if HAVE_MMX
@@ -94,6 +97,34 @@ static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, u
 		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
+#elif HAVE_SSE2
+        __m128i zero = _mm_set1_epi8(0);
+        for(x=0;x+8<w;x+=16){
+            __m128i mmsrc, mmsrca, mmdst, mmlow, mmhigh, premult, res, mask;
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + x));
+            mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+
+            // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8)
+            mmlow = _mm_mulhi_epu16(_mm_unpacklo_epi8(mmdst, zero), _mm_unpacklo_epi8(zero, mmsrca));
+            mmhigh = _mm_mulhi_epu16(_mm_unpackhi_epi8(mmdst, zero), _mm_unpackhi_epi8(zero, mmsrca));
+
+            premult = _mm_packus_epi16(mmlow, mmhigh);
+
+            // += src
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            res = _mm_add_epi8(premult, mmsrc);
+
+            // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
+            // thus no need to mask res
+            // _mm_maskmoveu_si128 would be an alternative but slower
+            mask = _mm_cmpeq_epi8(mmsrca, zero);
+            mmdst = _mm_and_si128(mmdst, mask);
+            res = _mm_or_si128(res, mmdst);
+            _mm_storeu_si128((__m128i *)(dstbase + x), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
+        }
 #else
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
@@ -155,7 +186,7 @@ static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, u
 		"psrlw	$8, %%mm0\n\t"
 		"pand %%mm5, %%mm1\n\t" 	//U0V0U0V0
 		"movd %2, %%mm2\n\t"		//src 0000DCBA
-		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
+		"punpcklbw %%mm7, %%mm2\n\t"	//src 0D0C0B0A
 		"por %%mm1, %%mm0\n\t"
 		"paddb	%%mm2, %%mm0\n\t"
 		"movq	%%mm0, %0\n\t"
@@ -163,6 +194,67 @@ static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, u
 		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
 		: "%eax");
 	}
+#elif HAVE_SSE2
+        __m128i zero = _mm_set1_epi8(0);
+        __m128i ymask = _mm_set1_epi16(0xff);
+        __m128i uvofs = _mm_set1_epi16(0x8000);
+        for(x=0;x+12<w;x+=16){
+            __m128i mmsrc, mmsrca, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, premult, res, mask;
+            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
+            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 8));
+
+            // convert UV to signed
+            mmdst = _mm_xor_si128(mmdst, uvofs);
+            mmdst2 = _mm_xor_si128(mmdst2, uvofs);
+
+            mmsrca = _mm_load_si128((const __m128i *)(srca + x));
+            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
+            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);
+
+            // (mmhigh,mmlow) = (dst(y) * (srca * 256)) / 65536 (= (dst * srca) >> 8)
+            mmlow = _mm_mulhi_epu16(_mm_and_si128(mmdst, ymask), mmsrcalo);
+            mmhigh = _mm_mulhi_epu16(_mm_and_si128(mmdst2, ymask), mmsrcahi);
+
+            mmy = _mm_packus_epi16(mmlow, mmhigh);
+
+            // += src
+            mmsrc = _mm_load_si128((const __m128i *)(src + x));
+            mmy = _mm_add_epi8(mmy, mmsrc);
+
+            // mmuv = ((dst(uv) ^ 128) * (srca * 256)) / 65536 ^ 128 (= (((dst - 128) * srca) >> 8)) + 128
+            mmlow = _mm_srai_epi16(mmdst, 8);
+            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
+            mmhigh = _mm_srai_epi16(mmdst2, 8);
+            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);
+
+            mmuv = _mm_packs_epi16(mmlow, mmhigh);
+
+            // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
+            // thus no need to mask mmy.
+            // Since mmuv is still signed the same applies to it.
+            // _mm_maskmoveu_si128 would be an alternative but slower
+            mask = _mm_cmpeq_epi16(mmsrcalo, zero);
+            mmdst = _mm_and_si128(mmdst, mask);
+            res = _mm_unpacklo_epi8(mmy, mmuv);
+            res = _mm_or_si128(res, mmdst);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);
+
+            mask = _mm_cmpeq_epi16(mmsrcahi, zero);
+            mmdst2 = _mm_and_si128(mmdst2, mask);
+            res = _mm_unpackhi_epi8(mmy, mmuv);
+            res = _mm_or_si128(res, mmdst2);
+            // convert UV to unsigned
+            res = _mm_xor_si128(res, uvofs);
+            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 8), res);
+        }
+        for(;x<w;x++){
+            if(srca[x]) {
+               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
+               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
+            }
+        }
 #else
         for(x=0;x<w;x++){
 #ifdef FAST_OSD
diff --git a/sub/sub.c b/sub/sub.c
index 86f7fffad..0d285e134 100644
--- a/sub/sub.c
+++ b/sub/sub.c
@@ -152,7 +152,7 @@ static void alloc_buf(mp_osd_obj_t* obj)
     int len;
     if (obj->bbox.x2 < obj->bbox.x1) obj->bbox.x2 = obj->bbox.x1;
     if (obj->bbox.y2 < obj->bbox.y1) obj->bbox.y2 = obj->bbox.y1;
-    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+7)&(~7);
+    obj->stride = ((obj->bbox.x2-obj->bbox.x1)+15)&(~15);
     len = obj->stride*(obj->bbox.y2-obj->bbox.y1);
     if (obj->allocated<len) {
 	obj->allocated = len;
--
2.20.1



More information about the MPlayer-dev-eng mailing list