[Ffmpeg-devel] [RFC] port cabac asm to AMD64

Reimar Döffinger Reimar.Doeffinger
Sat Oct 21 11:55:36 CEST 2006


Hello,
On Sat, Oct 21, 2006 at 01:52:22AM +0200, Michael Niedermayer wrote:
[...]
> except that patch ok, assuming it still works with current svn

The attached version should, except that I don't have any sample using
decode_significance_8x8_x86. If someone has one, please give a URL or
upload it to incoming.
Overall H.264 decoding speed for my sample is 5% faster with this (using
only a quick, imprecise mplayer -benchmark test),
though it should be possible to optimize further; mixing use of 64- and
32-bit registers is said to be quite slow.
If it's still fine, I'd apply it as soon as I have a sample using
decode_significance_8x8_x86, but not before the MPlayer release.

Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libavcodec/cabac.h
===================================================================
--- libavcodec/cabac.h	(revision 6750)
+++ libavcodec/cabac.h	(working copy)
@@ -28,6 +28,12 @@
 
 //#undef NDEBUG
 #include <assert.h>
+#ifdef ARCH_X86_64
+#define ARCH_X86
+#endif
+#ifdef ARCH_X86
+#include "x86_cpu.h"
+#endif
 
 #define CABAC_BITS 16
 #define CABAC_MASK ((1<<CABAC_BITS)-1)
@@ -364,9 +370,15 @@
     //FIXME gcc generates duplicate load/stores for c->low and c->range
 #define LOW          "0"
 #define RANGE        "4"
+#ifdef ARCH_X86_64
+#define BYTESTART   "16"
+#define BYTE        "24"
+#define BYTEEND     "32"
+#else
 #define BYTESTART   "12"
 #define BYTE        "16"
 #define BYTEEND     "20"
+#endif
 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__))
     int bit;
 
@@ -403,14 +415,14 @@
 //eax:state ebx:low, edx:range, esi:RangeLPS
         "test %%bx, %%bx                        \n\t"
         " jnz 2f                                \n\t"
-        "movl "BYTE     "(%2), %%esi            \n\t"
+        "mov  "BYTE     "(%2), %%"REG_S"        \n\t"
         "subl $0xFFFF, %%ebx                    \n\t"
-        "movzwl (%%esi), %%ecx                  \n\t"
+        "movzwl (%%"REG_S"), %%ecx              \n\t"
         "bswap %%ecx                            \n\t"
         "shrl $15, %%ecx                        \n\t"
-        "addl $2, %%esi                         \n\t"
+        "add  $2, %%"REG_S"                     \n\t"
         "addl %%ecx, %%ebx                      \n\t"
-        "movl %%esi, "BYTE    "(%2)             \n\t"
+        "mov  %%"REG_S", "BYTE    "(%2)         \n\t"
         "jmp 2f                                 \n\t"
         "1:                                     \n\t"
 //eax:state ebx:low, edx:range, esi:RangeLPS
@@ -421,17 +433,17 @@
         "shll %%cl, %%edx                       \n\t"
         "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx   \n\t"
         "movb %%cl, (%1)                        \n\t"
-        "addl $1, %0                            \n\t"
+        "add  $1, %0                            \n\t"
         "test %%bx, %%bx                        \n\t"
         " jnz 2f                                \n\t"
 
-        "movl "BYTE     "(%2), %%ecx            \n\t"
-        "movzwl (%%ecx), %%esi                  \n\t"
+        "mov  "BYTE     "(%2), %%"REG_c"        \n\t"
+        "movzwl (%%"REG_c"), %%esi              \n\t"
         "bswap %%esi                            \n\t"
         "shrl $15, %%esi                        \n\t"
         "subl $0xFFFF, %%esi                    \n\t"
-        "addl $2, %%ecx                         \n\t"
-        "movl %%ecx, "BYTE    "(%2)             \n\t"
+        "add  $2, %%"REG_c"                     \n\t"
+        "mov  %%"REG_c", "BYTE    "(%2)         \n\t"
 
         "leal -1(%%ebx), %%ecx                  \n\t"
         "xorl %%ebx, %%ecx                      \n\t"
@@ -447,7 +459,7 @@
         "movl %%ebx, "LOW      "(%2)            \n\t"
         :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used
         :"r"(state), "r"(c)
-        : "%ecx", "%ebx", "%edx", "%esi", "memory"
+        : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory"
     );
     bit&=1;
 #else /* BRANCHLESS_CABAC_DECODER */
@@ -493,13 +505,13 @@
         "shl    %%cl        , "low"                                     \n\t"\
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   1f                                                      \n\t"\
-        "mov "BYTE"("cabac"), %%ecx                                     \n\t"\
-        "movzwl (%%ecx)     , "tmp"                                     \n\t"\
+        "mov "BYTE"("cabac"), %%"REG_c"                                 \n\t"\
+        "movzwl (%%"REG_c")     , "tmp"                                 \n\t"\
         "bswap  "tmp"                                                   \n\t"\
         "shr    $15         , "tmp"                                     \n\t"\
         "sub    $0xFFFF     , "tmp"                                     \n\t"\
-        "add    $2          , %%ecx                                     \n\t"\
-        "mov    %%ecx       , "BYTE    "("cabac")                       \n\t"\
+        "add    $2          , %%"REG_c"                                 \n\t"\
+        "mov    %%"REG_c"   , "BYTE    "("cabac")                       \n\t"\
         "lea    -1("low")   , %%ecx                                     \n\t"\
         "xor    "low"       , %%ecx                                     \n\t"\
         "shr    $15         , %%ecx                                     \n\t"\
@@ -519,7 +531,7 @@
 
         :"=&a"(bit)
         :"r"(state), "r"(c)
-        : "%ecx", "%ebx", "%edx", "%esi", "memory"
+        : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
     );
     bit&=1;
 #endif /* BRANCHLESS_CABAC_DECODER */
@@ -588,20 +600,20 @@
         "add %%ebx, %%eax                       \n\t"
         "test %%ax, %%ax                        \n\t"
         " jnz 1f                                \n\t"
-        "movl "BYTE     "(%1), %%ebx            \n\t"
+        "movl "BYTE     "(%1), %%"REG_b"        \n\t"
         "subl $0xFFFF, %%eax                    \n\t"
-        "movzwl (%%ebx), %%ecx                  \n\t"
+        "movzwl (%%"REG_b"), %%ecx              \n\t"
         "bswap %%ecx                            \n\t"
         "shrl $15, %%ecx                        \n\t"
-        "addl $2, %%ebx                         \n\t"
+        "addl $2, %%"REG_b"                     \n\t"
         "addl %%ecx, %%eax                      \n\t"
-        "movl %%ebx, "BYTE     "(%1)            \n\t"
+        "movl %%"REG_b", "BYTE     "(%1)        \n\t"
         "1:                                     \n\t"
         "movl %%eax, "LOW      "(%1)            \n\t"
 
         :"=&d"(bit)
         :"r"(c)
-        : "%eax", "%ebx", "%ecx", "memory"
+        : "%eax", "%"REG_b, "%ecx", "memory"
     );
     return bit+1;
 #else
@@ -637,20 +649,20 @@
         "sub %%edx, %%ecx                       \n\t"
         "test %%ax, %%ax                        \n\t"
         " jnz 1f                                \n\t"
-        "movl "BYTE     "(%1), %%ebx            \n\t"
+        "mov  "BYTE     "(%1), %%"REG_b"        \n\t"
         "subl $0xFFFF, %%eax                    \n\t"
-        "movzwl (%%ebx), %%edx                  \n\t"
+        "movzwl (%%"REG_b"), %%edx              \n\t"
         "bswap %%edx                            \n\t"
         "shrl $15, %%edx                        \n\t"
-        "addl $2, %%ebx                         \n\t"
+        "add  $2, %%"REG_b"                     \n\t"
         "addl %%edx, %%eax                      \n\t"
-        "movl %%ebx, "BYTE     "(%1)            \n\t"
+        "mov  %%"REG_b", "BYTE     "(%1)        \n\t"
         "1:                                     \n\t"
         "movl %%eax, "LOW      "(%1)            \n\t"
 
         :"+c"(val)
         :"r"(c)
-        : "%eax", "%ebx", "%edx", "memory"
+        : "%eax", "%"REG_b, "%edx", "memory"
     );
     return val;
 #else
@@ -690,34 +702,34 @@
 
         BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
 
-        "movl %2, %%eax                         \n\t"
+        "mov  %2, %%"REG_a"                     \n\t"
         "movl %4, %%ecx                         \n\t"
-        "addl %1, %%ecx                         \n\t"
-        "movl %%ecx, (%%eax)                    \n\t"
+        "add  %1, %%"REG_c"                     \n\t"
+        "movl %%ecx, (%%"REG_a")                \n\t"
 
         "test $1, %%edx                         \n\t"
         " jnz 4f                                \n\t"
 
-        "addl $4, %%eax                         \n\t"
-        "movl %%eax, %2                         \n\t"
+        "add  $4, %%"REG_a"                     \n\t"
+        "mov  %%"REG_a", %2                     \n\t"
 
         "3:                                     \n\t"
-        "addl $1, %1                            \n\t"
-        "cmpl %5, %1                            \n\t"
+        "add  $1, %1                            \n\t"
+        "cmp  %5, %1                            \n\t"
         " jb 2b                                 \n\t"
-        "movl %2, %%eax                         \n\t"
+        "mov  %2, %%"REG_a"                     \n\t"
         "movl %4, %%ecx                         \n\t"
-        "addl %1, %%ecx                         \n\t"
-        "movl %%ecx, (%%eax)                    \n\t"
+        "add  %1, %%"REG_c"                     \n\t"
+        "movl %%ecx, (%%"REG_a")                \n\t"
         "4:                                     \n\t"
-        "addl %6, %%eax                         \n\t"
+        "add  %6, %%eax                         \n\t"
         "shr $2, %%eax                          \n\t"
 
         "movl %%esi, "RANGE    "(%3)            \n\t"
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
         :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
-        : "%ecx", "%ebx", "%edx", "%esi", "memory"\
+        : "%"REG_c, "%ebx", "%edx", "%esi", "memory"\
     );
     return coeff_count;
 }
@@ -725,16 +737,16 @@
 static int decode_significance_8x8_x86(CABACContext *c, uint8_t *significant_coeff_ctx_base, int *index, uint8_t *sig_off){
     int minusindex= 4-(int)index;
     int coeff_count;
-    int last=0;
+    long last=0;
     asm volatile(
         "movl "RANGE    "(%3), %%esi            \n\t"
         "movl "LOW      "(%3), %%ebx            \n\t"
 
-        "mov %1, %%edi                          \n\t"
+        "mov %1, %%"REG_D"                      \n\t"
         "2:                                     \n\t"
 
-        "mov %6, %%eax                          \n\t"
-        "movzbl (%%eax, %%edi), %%edi           \n\t"
+        "mov %6, %%"REG_a"                      \n\t"
+        "movzbl (%%"REG_a", %%"REG_D"), %%edi   \n\t"
         "add %5, %%edi                          \n\t"
 
         BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%edi)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
@@ -748,23 +760,23 @@
 
         BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%edi)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")
 
-        "movl %2, %%eax                         \n\t"
+        "mov %2, %%"REG_a"                      \n\t"
         "mov %1, %%edi                          \n\t"
-        "movl %%edi, (%%eax)                    \n\t"
+        "movl %%edi, (%%"REG_a")                \n\t"
 
         "test $1, %%edx                         \n\t"
         " jnz 4f                                \n\t"
 
-        "addl $4, %%eax                         \n\t"
-        "movl %%eax, %2                         \n\t"
+        "add $4, %%"REG_a"                      \n\t"
+        "mov %%"REG_a", %2                      \n\t"
 
         "3:                                     \n\t"
         "addl $1, %%edi                         \n\t"
         "mov %%edi, %1                          \n\t"
         "cmpl $63, %%edi                        \n\t"
         " jb 2b                                 \n\t"
-        "movl %2, %%eax                         \n\t"
-        "movl %%edi, (%%eax)                    \n\t"
+        "mov %2, %%"REG_a"                      \n\t"
+        "movl %%edi, (%%"REG_a")                \n\t"
         "4:                                     \n\t"
         "addl %4, %%eax                         \n\t"
         "shr $2, %%eax                          \n\t"
@@ -773,7 +785,7 @@
         "movl %%ebx, "LOW      "(%3)            \n\t"
         :"=&a"(coeff_count),"+m"(last), "+m"(index)\
         :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)\
-        : "%ecx", "%ebx", "%edx", "%esi", "%edi", "memory"\
+        : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"\
     );
     return coeff_count;
 }



More information about the ffmpeg-devel mailing list