[MPlayer-dev-eng] [PATCH 4/6] libvo: optimize fastmemcpy for loongson with mmi
周晓勇
zhouxiaoyong at loongson.cn
Wed Sep 2 12:07:10 CEST 2015
From b2709e8c8034977b21b239aaf41f291878b8199c Mon Sep 17 00:00:00 2001
From: ZhouXiaoyong <zhouxiaoyong at loongson.cn>
Date: Thu, 20 Aug 2015 17:26:57 +0800
Subject: [PATCH 4/6] libvo: optimize fastmemcpy for loongson with mmi
Signed-off-by: ZhouXiaoyong <zhouxiaoyong at loongson.cn>
---
libvo/aclib.c | 12 +++++-
libvo/aclib_template.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
libvo/fastmemcpy.h | 2 +-
3 files changed, 120 insertions(+), 3 deletions(-)
diff --git a/libvo/aclib.c b/libvo/aclib.c
index 17f0f8c..b21d515 100644
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -155,6 +155,10 @@
#endif /* ARCH_X86 */
+#if ARCH_LOONGSON
+#define RENAME(a) a ## _MMI
+#include "aclib_template.c"
+#endif
#undef fast_memcpy
void * fast_memcpy(void * to, const void * from, size_t len)
@@ -174,7 +178,9 @@ void * fast_memcpy(void * to, const void * from, size_t len)
#endif
memcpy(to, from, len); // prior to mmx we use the standart memcpy
#else
-#if HAVE_SSE2
+#if ARCH_LOONGSON
+fast_memcpy_MMI(to, from, len);
+#elif HAVE_SSE2
fast_memcpy_SSE(to, from, len);
#elif HAVE_MMX2
fast_memcpy_MMX2(to, from, len);
@@ -208,7 +214,9 @@ void * mem2agpcpy(void * to, const void * from, size_t len)
#endif
memcpy(to, from, len); // prior to mmx we use the standart memcpy
#else
-#if HAVE_SSE2
+#if ARCH_LOONGSON
+mem2agpcpy_MMI(to, from, len);
+#elif HAVE_SSE2
mem2agpcpy_SSE(to, from, len);
#elif HAVE_MMX2
mem2agpcpy_MMX2(to, from, len);
diff --git a/libvo/aclib_template.c b/libvo/aclib_template.c
index 68c9313..92c11f8 100644
--- a/libvo/aclib_template.c
+++ b/libvo/aclib_template.c
@@ -100,6 +100,10 @@ If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
#define HAVE_K6_2PLUS
#endif
+#if ARCH_LOONGSON
+/* no hand-written small-block copy on Loongson: small_memcpy is plain memcpy */
+#define small_memcpy memcpy
+#else
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
@@ -113,10 +117,13 @@ __asm__ volatile(\
:"0" (to), "1" (from),"2" (n)\
: "memory");\
}
+#endif
#undef MMREG_SIZE
#if HAVE_SSE
#define MMREG_SIZE 16
+#elif ARCH_LOONGSON
+#define MMREG_SIZE 8
#else
#define MMREG_SIZE 64 //8
#endif
@@ -149,10 +156,61 @@ __asm__ volatile(\
#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
+#elif ARCH_LOONGSON
+#define MIN_LEN 0x40
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
+#if ARCH_LOONGSON
+/**
+ * Loongson variant of fast_memcpy.
+ *
+ * Copies len bytes from 'from' to 'to'. When len >= MIN_LEN and both
+ * pointers are 16-byte aligned, the bulk is moved 64 bytes per iteration
+ * with gslqc1/gssqc1 through $f0..$f14. Whatever remains (or the whole
+ * block when the fast path is not taken) is handed to small_memcpy.
+ *
+ * @param to   destination buffer
+ * @param from source buffer
+ * @param len  number of bytes to copy
+ * @return the original value of 'to'
+ */
+static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
+{
+    void *retval;
+    size_t i;
+
+    retval = to;
+
+    /*
+     * Loads into $0 discard their result.
+     * NOTE(review): presumably these serve as prefetch hints for the
+     * first source cache lines on Loongson -- confirm against the CPU
+     * manual. They touch addresses up to from+256 regardless of len.
+     */
+    __asm__ volatile (
+        "ld $0, (%0) \n"
+        "ld $0, 64(%0) \n"
+        "ld $0, 128(%0) \n"
+        "ld $0, 192(%0) \n"
+        "ld $0, 256(%0) \n"
+        ::"r"(from)
+    );
+
+    if (len >= MIN_LEN && (((intptr_t)from) & 15) ==0
+                       && (((intptr_t)to  ) & 15) ==0) {
+        i = len >> 6; /* len/64 */
+        len &= 63;    /* bytes left for the tail copy */
+
+        for (; i>0; i--) {
+            /*
+             * One 64-byte chunk: four 16-byte loads into FP register
+             * pairs followed by four 16-byte stores. The leading load
+             * into $0 touches from+320, i.e. ahead of the current chunk.
+             *
+             * The asm reads *from and writes *to through plain register
+             * operands, so the "memory" clobber is required to stop the
+             * compiler from caching or reordering memory accesses
+             * across it.
+             */
+            __asm__ volatile (
+                "ld $0, 320(%[from]) \n"
+                "gslqc1 $f0, $f2, 0(%[from]) \n"
+                "gslqc1 $f4, $f6, 16(%[from]) \n"
+                "gslqc1 $f8, $f10, 32(%[from]) \n"
+                "gslqc1 $f12, $f14, 48(%[from]) \n"
+                "gssqc1 $f0, $f2, 0(%[to]) \n"
+                "gssqc1 $f4, $f6, 16(%[to]) \n"
+                "gssqc1 $f8, $f10, 32(%[to]) \n"
+                "gssqc1 $f12, $f14, 48(%[to]) \n"
+                ::[from]"r"(from),[to]"r"(to)
+                : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14",
+                  "memory"
+            );
+            from = ((const unsigned char *)from)+64;
+            to = ((unsigned char *)to)+64;
+        }
+    }
+    /*
+     * Now do the tail of the block
+     */
+    if (len) small_memcpy(to, from, len);
+    return retval;
+}
+
+#else /* ARCH_LOONGSON */
+
static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
void *retval;
@@ -379,10 +437,60 @@ static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
if(len) small_memcpy(to, from, len);
return retval;
}
+#endif /* ARCH_LOONGSON */
/**
* special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
*/
+#if ARCH_LOONGSON
+/**
+ * Loongson variant of mem2agpcpy.
+ *
+ * Copies len bytes from 'from' to 'to'. When len >= MIN_LEN and both
+ * pointers are 16-byte aligned, the bulk is moved 64 bytes per iteration
+ * with gslqc1/gssqc1 through $f0..$f14. Whatever remains (or the whole
+ * block when the fast path is not taken) is handed to small_memcpy.
+ *
+ * @param to   destination buffer
+ * @param from source buffer
+ * @param len  number of bytes to copy
+ * @return the original value of 'to'
+ */
+static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
+{
+    void *retval;
+    size_t i;
+
+    retval = to;
+
+    /*
+     * Loads into $0 discard their result.
+     * NOTE(review): presumably these serve as prefetch hints for the
+     * first source cache lines on Loongson -- confirm against the CPU
+     * manual. They touch addresses up to from+256 regardless of len.
+     */
+    __asm__ volatile (
+        "ld $0, (%0) \n"
+        "ld $0, 64(%0) \n"
+        "ld $0, 128(%0) \n"
+        "ld $0, 192(%0) \n"
+        "ld $0, 256(%0) \n"
+        ::"r"(from)
+    );
+
+    if (len >= MIN_LEN && (((intptr_t)from) & 15) ==0
+                       && (((intptr_t)to  ) & 15) ==0) {
+        i = len >> 6; /* len/64 */
+        len &= 63;    /* bytes left for the tail copy */
+
+        for (; i>0; i--) {
+            /*
+             * One 64-byte chunk: four 16-byte loads into FP register
+             * pairs followed by four 16-byte stores. The leading load
+             * into $0 touches from+320, i.e. ahead of the current chunk.
+             *
+             * The asm reads *from and writes *to through plain register
+             * operands, so the "memory" clobber is required to stop the
+             * compiler from caching or reordering memory accesses
+             * across it.
+             */
+            __asm__ volatile (
+                "ld $0, 320(%[from]) \n"
+                "gslqc1 $f0, $f2, 0(%[from]) \n"
+                "gslqc1 $f4, $f6, 16(%[from]) \n"
+                "gslqc1 $f8, $f10, 32(%[from]) \n"
+                "gslqc1 $f12, $f14, 48(%[from]) \n"
+                "gssqc1 $f0, $f2, 0(%[to]) \n"
+                "gssqc1 $f4, $f6, 16(%[to]) \n"
+                "gssqc1 $f8, $f10, 32(%[to]) \n"
+                "gssqc1 $f12, $f14, 48(%[to]) \n"
+                ::[from]"r"(from),[to]"r"(to)
+                : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14",
+                  "memory"
+            );
+            from = ((const unsigned char *)from)+64;
+            to = ((unsigned char *)to)+64;
+        }
+    }
+    /*
+     * Now do the tail of the block
+     */
+    if (len) small_memcpy(to, from, len);
+    return retval;
+}
+
+#else /* ARCH_LOONGSON */
+
static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
{
void *retval;
@@ -461,3 +569,4 @@ static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
if(len) small_memcpy(to, from, len);
return retval;
}
+#endif /* ARCH_LOONGSON */
diff --git a/libvo/fastmemcpy.h b/libvo/fastmemcpy.h
index 094d3ac..89ab04e 100644
--- a/libvo/fastmemcpy.h
+++ b/libvo/fastmemcpy.h
@@ -27,7 +27,7 @@
void * fast_memcpy(void * to, const void * from, size_t len);
void * mem2agpcpy(void * to, const void * from, size_t len);
-#if ! defined(CONFIG_FASTMEMCPY) || ! (HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW /* || HAVE_SSE || HAVE_SSE2 */)
+#if ! defined(CONFIG_FASTMEMCPY) || ! (ARCH_LOONGSON || HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW /* || HAVE_SSE || HAVE_SSE2 */)
#define mem2agpcpy(a,b,c) memcpy(a,b,c)
#define fast_memcpy(a,b,c) memcpy(a,b,c)
#endif
--
2.1.4
More information about the MPlayer-dev-eng
mailing list