[FFmpeg-devel] [PATCH] h264: assembly version of get_cabac for x86_64 with PIC

Roland Scheidegger rscheidegger_lists at hispeed.ch
Fri Apr 13 03:46:13 CEST 2012


From: Roland Scheidegger <rscheideger_lists at hispeed.ch>

This adds a hand-optimized assembly version for get_cabac much like the
existing one, but it works if the table offsets are RIP-relative.
Compared to the non-RIP-relative version this adds 4 instructions
(3 RIP-relative movs, 1 lea) and needs one extra register; two of the
RIP-relative movs could be eliminated by using a single table and using offsets
instead.
Since x86_64 CPUs always support cmov, this version is always used there (I don't
care if you have a P4 Prescott whose cmov implementation is useless).
There is a surprisingly large performance improvement over the c version (more
so than the generated assembly seems to suggest) just in get_cabac, I measured
roughly 40% faster for get_cabac on a K8.
There are similar functions which could get the same treatment, but they
are less frequently used, and since this isn't very nice (we can't use the
same assembly template), focus on this function alone for now.
---
 libavcodec/x86/cabac.h |   66 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 66 insertions(+), 0 deletions(-)

diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index 0c4419b..0fb3582 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -24,8 +24,73 @@
 #include "libavcodec/cabac.h"
 #include "libavutil/attributes.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/internal.h"
 #include "config.h"
 
+#if defined(BROKEN_RELOCATIONS)
+#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, tmpq, tmp2q, byte, end) /* table addresses are loaded GOT-relative; "end" is accepted but unused */ \
+        "movzbl "statep"    , "ret"                                     \n\t" /* ret = *statep */\
+        "mov    "range"     , "tmp"                                     \n\t"\
+        "and    $0xC0       , "range"                                   \n\t"\
+        "lea    ("ret", "range", 2), %%ecx                              \n\t" /* ecx = ret + 2 * (range & 0xC0) */\
+        "mov    ff_h264_lps_range@GOTPCREL(%%rip), "tmp2q"              \n\t" /* tmp2q = address of ff_h264_lps_range */\
+        "movzbl ("tmp2q", %%rcx), "range"                               \n\t" /* range = ff_h264_lps_range[ecx] */\
+        "sub    "range"     , "tmp"                                     \n\t"\
+        "mov    "tmp"       , %%ecx                                     \n\t" /* keep the unshifted difference */\
+        "shl    $17         , "tmp"                                     \n\t"\
+        "cmp    "low"       , "tmp"                                     \n\t"\
+        "mov    ff_h264_norm_shift@GOTPCREL(%%rip), "tmp2q"             \n\t" /* tmp2q = address of ff_h264_norm_shift (mov leaves flags intact) */\
+        "cmova  %%ecx       , "range"                                   \n\t" /* tmp > low (unsigned): range = unshifted difference */\
+        "sbb    %%rcx       , %%rcx                                     \n\t" /* rcx = -1 if tmp < low (unsigned), else 0 */\
+        "and    %%ecx       , "tmp"                                     \n\t"\
+        "xor    %%rcx       , "retq"                                    \n\t"\
+        "sub    "tmp"       , "low"                                     \n\t"\
+        "movzbl ("tmp2q", "rangeq"), %%ecx                              \n\t" /* ecx = ff_h264_norm_shift[range] */\
+        "mov    ff_h264_mlps_state@GOTPCREL(%%rip), "tmpq"              \n\t" /* tmpq = address of ff_h264_mlps_state */\
+        "shl    %%cl        , "range"                                   \n\t"\
+        "movzbl 128("tmpq", "retq"), "tmp"                              \n\t" /* tmp = ff_h264_mlps_state[128 + ret] */\
+        "shl    %%cl        , "low"                                     \n\t"\
+        "mov    "tmpbyte"   , "statep"                                  \n\t" /* *statep = byte just loaded */\
+        "test   "lowword"   , "lowword"                                 \n\t"\
+        " jnz   2f                                                      \n\t" /* low 16 bits of low nonzero: skip the refill below */\
+        "mov    "byte"      , %%"REG_c"                                 \n\t" /* load the pointer stored at the byte operand */\
+        "add"OPSIZE" $2     , "byte"                                    \n\t" /* advance the stored pointer by 2 */\
+        "movzwl (%%"REG_c") , "tmp"                                     \n\t" /* read 16 bits through the old pointer */\
+        "lea    -1("low")   , %%ecx                                     \n\t"\
+        "xor    "low"       , %%ecx                                     \n\t"\
+        "shr    $15         , %%ecx                                     \n\t"\
+        "bswap  "tmp"                                                   \n\t"\
+        "shr    $15         , "tmp"                                     \n\t"\
+        "movzbl ("tmp2q", %%rcx), %%ecx                                 \n\t" /* tmp2q still holds ff_h264_norm_shift */\
+        "sub    $0xFFFF     , "tmp"                                     \n\t"\
+        "neg    %%ecx                                                   \n\t"\
+        "add    $7          , %%ecx                                     \n\t" /* ecx = 7 - table value */\
+        "shl    %%cl        , "tmp"                                     \n\t"\
+        "add    "tmp"       , "low"                                     \n\t"\
+        "2:                                                             \n\t"
+
+#define get_cabac_inline get_cabac_inline_x86 /* callers of get_cabac_inline get the asm version below */
+static av_always_inline int get_cabac_inline_x86(CABACContext *c,
+                                                 uint8_t *const state) /* *state is read and rewritten by the asm */
+{
+    int bit, tmp, tmp2; /* asm result, scratch, and scratch used for table addresses (as %q4) */
+    /* Operands: %0 bit, %1 c->low, %2 c->range, %3 tmp, %4 tmp2, %5 state, %6 c, %7/%8 field offsets. */
+    __asm__ volatile(
+        BRANCHLESS_GET_CABAC("%0", "%q0", "(%5)", "%1", "%w1",
+                             "%2", "%q2", "%3", "%b3", "%q3", "%q4",
+                             "%a7(%6)", "%a8(%6)") /* c->bytestream and c->bytestream_end as offset(base) memory operands */
+        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp), "=&r"(tmp2) /* tmp uses "q" because %b3 needs a byte register */
+        : "r"(state), "r"(c),
+          "i"(offsetof(CABACContext, bytestream)),
+          "i"(offsetof(CABACContext, bytestream_end))
+        : "%"REG_c, "memory" /* template uses the c register as scratch and writes *state and c->bytestream */
+    );
+    return bit & 1; /* only bit 0 of the asm result is returned */
+}
+
+
+#else
+
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, statep, low, range, tmp)\
         "mov    "tmp"       , %%ecx     \n\t"\
@@ -103,6 +168,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     return bit & 1;
 }
 #endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif
 
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
-- 
1.7.1



More information about the ffmpeg-devel mailing list