[MPlayer-dev-eng] [PATCH 4/6] libvo: optimize fastmemcpy for loongson with mmi

周晓勇 zhouxiaoyong at loongson.cn
Wed Sep 2 12:07:10 CEST 2015


From b2709e8c8034977b21b239aaf41f291878b8199c Mon Sep 17 00:00:00 2001
From: ZhouXiaoyong <zhouxiaoyong at loongson.cn>
Date: Thu, 20 Aug 2015 17:26:57 +0800
Subject: [PATCH 4/6] libvo: optimize fastmemcpy for loongson with mmi


Signed-off-by: ZhouXiaoyong <zhouxiaoyong at loongson.cn>
---
 libvo/aclib.c          |  12 +++++-
 libvo/aclib_template.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
 libvo/fastmemcpy.h     |   2 +-
 3 files changed, 120 insertions(+), 3 deletions(-)


diff --git a/libvo/aclib.c b/libvo/aclib.c
index 17f0f8c..b21d515 100644
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -155,6 +155,10 @@
 
 #endif /* ARCH_X86 */
 
+#if ARCH_LOONGSON
+#define RENAME(a) a ## _MMI
+#include "aclib_template.c"
+#endif
 
 #undef fast_memcpy
 void * fast_memcpy(void * to, const void * from, size_t len)
@@ -174,7 +178,9 @@ void * fast_memcpy(void * to, const void * from, size_t len)
 #endif
 memcpy(to, from, len); // prior to mmx we use the standart memcpy
 #else
-#if HAVE_SSE2
+#if ARCH_LOONGSON
+fast_memcpy_MMI(to, from, len);
+#elif HAVE_SSE2
 fast_memcpy_SSE(to, from, len);
 #elif HAVE_MMX2
 fast_memcpy_MMX2(to, from, len);
@@ -208,7 +214,9 @@ void * mem2agpcpy(void * to, const void * from, size_t len)
 #endif
 memcpy(to, from, len); // prior to mmx we use the standart memcpy
 #else
-#if HAVE_SSE2
+#if ARCH_LOONGSON
+mem2agpcpy_MMI(to, from, len);
+#elif HAVE_SSE2
 mem2agpcpy_SSE(to, from, len);
 #elif HAVE_MMX2
 mem2agpcpy_MMX2(to, from, len);
diff --git a/libvo/aclib_template.c b/libvo/aclib_template.c
index 68c9313..92c11f8 100644
--- a/libvo/aclib_template.c
+++ b/libvo/aclib_template.c
@@ -100,6 +100,10 @@ If you have questions please contact with me: Nick Kurshev: nickols_k at mail.ru.
 #define HAVE_K6_2PLUS
 #endif
 
+#if ARCH_LOONGSON
+/* Loongson has no hand-written small-copy routine: small_memcpy is plain memcpy */
+#define small_memcpy memcpy
+#else
 /* for small memory blocks (<256 bytes) this version is faster */
 #define small_memcpy(to,from,n)\
 {\
@@ -113,10 +117,13 @@ __asm__ volatile(\
         :"0" (to), "1" (from),"2" (n)\
 : "memory");\
 }
+#endif
 
 #undef MMREG_SIZE
 #if HAVE_SSE
 #define MMREG_SIZE 16
+#elif ARCH_LOONGSON
+#define MMREG_SIZE 8
 #else
 #define MMREG_SIZE 64 //8
 #endif
@@ -149,10 +156,61 @@ __asm__ volatile(\
 #undef MIN_LEN
 #ifdef HAVE_ONLY_MMX1
 #define MIN_LEN 0x800  /* 2K blocks */
+#elif ARCH_LOONGSON
+#define MIN_LEN 0x40
 #else
 #define MIN_LEN 0x40  /* 64-byte blocks */
 #endif
 
+#if ARCH_LOONGSON
+static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
+{
+    void *retval;
+    size_t i;
+    /* Copies len bytes from 'from' to 'to'; the original 'to' is returned. */
+    retval = to;
+    /* NOTE(review): ld into $0 discards the data; looks like a prefetch, and it reaches from+256 whatever len is - confirm. */
+__asm__ volatile (
+        "ld $0, (%0)                        \n"
+        "ld $0, 64(%0)                      \n"
+        "ld $0, 128(%0)                     \n"
+        "ld $0, 192(%0)                     \n"
+        "ld $0, 256(%0)                     \n"
+::"r"(from)
+    );
+    /* The 64-byte chunk loop runs only when both pointers are 16-byte aligned. */
+    if (len >= MIN_LEN && (((intptr_t)from) & 15) ==0
+                       && (((intptr_t)to  ) & 15) ==0) {
+        i = len >> 6; /* number of whole 64-byte chunks */
+        len &= 63;    /* bytes left over for the tail copy */
+        /* Each pass moves one chunk through the $f0..$f14 register pairs. */
+        for (; i>0; i--) {
+            __asm__ volatile (
+                "ld $0, 320(%[from])            \n"
+                "gslqc1 $f0, $f2, 0(%[from])    \n"
+                "gslqc1 $f4, $f6, 16(%[from])   \n"
+                "gslqc1 $f8, $f10, 32(%[from])  \n"
+                "gslqc1 $f12, $f14, 48(%[from]) \n"
+                "gssqc1 $f0, $f2, 0(%[to])      \n"
+                "gssqc1 $f4, $f6, 16(%[to])     \n"
+                "gssqc1 $f8, $f10, 32(%[to])    \n"
+                "gssqc1 $f12, $f14, 48(%[to])   \n"
+                ::[from]"r"(from),[to]"r"(to)
+                : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","memory" /* asm loads/stores through from/to */
+            );
+            from = ((const unsigned char *)from)+64;
+            to = ((unsigned char *)to)+64;
+        }
+    }
+    /*
+     * Copy what is left: under 64 bytes after the loop, or all of len if it was skipped.
+     */
+    if (len) small_memcpy(to, from, len);
+    return retval;
+}
+
+#else /* ARCH_LOONGSON */
+
 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
 {
 void *retval;
@@ -379,10 +437,60 @@ static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
 if(len) small_memcpy(to, from, len);
 return retval;
 }
+#endif /* ARCH_LOONGSON */
 
 /**
  * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
  */
+#if ARCH_LOONGSON
+static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
+{
+    void *retval;
+    size_t i;
+    /* Copies len bytes from 'from' to 'to'; the original 'to' is returned. */
+    retval = to;
+    /* NOTE(review): ld into $0 discards the data; looks like a prefetch, and it reaches from+256 whatever len is - confirm. */
+__asm__ volatile (
+        "ld $0, (%0)                        \n"
+        "ld $0, 64(%0)                      \n"
+        "ld $0, 128(%0)                     \n"
+        "ld $0, 192(%0)                     \n"
+        "ld $0, 256(%0)                     \n"
+::"r"(from)
+    );
+    /* The 64-byte chunk loop runs only when both pointers are 16-byte aligned. */
+    if (len >= MIN_LEN && (((intptr_t)from) & 15) ==0
+                       && (((intptr_t)to  ) & 15) ==0) {
+        i = len >> 6; /* number of whole 64-byte chunks */
+        len &= 63;    /* bytes left over for the tail copy */
+        /* Each pass moves one chunk through the $f0..$f14 register pairs. */
+        for (; i>0; i--) {
+            __asm__ volatile (
+                "ld $0, 320(%[from])            \n"
+                "gslqc1 $f0, $f2, 0(%[from])    \n"
+                "gslqc1 $f4, $f6, 16(%[from])   \n"
+                "gslqc1 $f8, $f10, 32(%[from])  \n"
+                "gslqc1 $f12, $f14, 48(%[from]) \n"
+                "gssqc1 $f0, $f2, 0(%[to])      \n"
+                "gssqc1 $f4, $f6, 16(%[to])     \n"
+                "gssqc1 $f8, $f10, 32(%[to])    \n"
+                "gssqc1 $f12, $f14, 48(%[to])   \n"
+                ::[from]"r"(from),[to]"r"(to)
+                : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","memory" /* asm loads/stores through from/to */
+            );
+            from = ((const unsigned char *)from)+64;
+            to = ((unsigned char *)to)+64;
+        }
+    }
+    /*
+     * Copy what is left: under 64 bytes after the loop, or all of len if it was skipped.
+     */
+    if (len) small_memcpy(to, from, len);
+    return retval;
+}
+
+#else /* ARCH_LOONGSON */
+
 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
 {
 void *retval;
@@ -461,3 +569,4 @@ static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
 if(len) small_memcpy(to, from, len);
 return retval;
 }
+#endif /* ARCH_LOONGSON */
diff --git a/libvo/fastmemcpy.h b/libvo/fastmemcpy.h
index 094d3ac..89ab04e 100644
--- a/libvo/fastmemcpy.h
+++ b/libvo/fastmemcpy.h
@@ -27,7 +27,7 @@
 void * fast_memcpy(void * to, const void * from, size_t len);
 void * mem2agpcpy(void * to, const void * from, size_t len);
 
-#if ! defined(CONFIG_FASTMEMCPY) || ! (HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW /* || HAVE_SSE || HAVE_SSE2 */)
+#if ! defined(CONFIG_FASTMEMCPY) || ! (ARCH_LOONGSON || HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW /* || HAVE_SSE || HAVE_SSE2 */)
 #define mem2agpcpy(a,b,c) memcpy(a,b,c)
 #define fast_memcpy(a,b,c) memcpy(a,b,c)
 #endif
-- 
2.1.4



More information about the MPlayer-dev-eng mailing list