[FFmpeg-cvslog] x86: huffyuvdsp: port add_bytes to yasm

Christophe Gisquet git at videolan.org
Thu May 29 22:30:09 CEST 2014


ffmpeg | branch: master | Christophe Gisquet <christophe.gisquet at gmail.com> | Wed May 28 15:52:24 2014 +0200| [99a319c4e7538670847ac4633ef8b0f0629deb22] | committer: Michael Niedermayer

x86: huffyuvdsp: port add_bytes to yasm

          C   MMX  SSE2
Cycles: 2972  587  302

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=99a319c4e7538670847ac4633ef8b0f0629deb22
---

 libavcodec/huffyuvdsp.c             |    2 +-
 libavcodec/huffyuvdsp.h             |    2 +-
 libavcodec/ppc/huffyuvdsp_altivec.c |    2 +-
 libavcodec/x86/huffyuvdsp.asm       |   37 +++++++++++++++++++++++++++++++++++
 libavcodec/x86/huffyuvdsp_init.c    |    9 +++++++--
 libavcodec/x86/huffyuvdsp_mmx.c     |   32 +-----------------------------
 6 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index cbc09cf..3d51552 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -27,7 +27,7 @@
 #define pb_7f (~0UL / 255 * 0x7f)
 #define pb_80 (~0UL / 255 * 0x80)
 
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     long i;
 
diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index fd66f0a..c52dd69 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -35,7 +35,7 @@
 
 typedef struct HuffYUVDSPContext {
     void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
-                      int w);
+                      intptr_t w);
     void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
                                  const uint8_t *diff, int w,
                                  int *left, int *left_top);
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c
index ff2bd87..0052dae 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@@ -31,7 +31,7 @@
 #include "libavcodec/huffyuvdsp.h"
 
 #if HAVE_ALTIVEC
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
 {
     register int i;
     register vector unsigned char vdst, vsrc;
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index f183ebe..a923e70 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq                      ; add_bytes: dst[i] += src[i] (bytewise, wrapping) for w bytes
+    and  sizeq, -2*mmsize               ; size = w rounded down to a multiple of 2*mmsize
+    jz  .2                              ; no full 2*mmsize chunk: only the byte tail remains
+    add   dstq, sizeq                   ; point dst/src just past the vector region...
+    add   srcq, sizeq
+    neg  sizeq                          ; ...and walk it with a negative offset rising to 0
+.1:                                     ; vector loop: 2*mmsize bytes per pass, aligned accesses (mova)
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1                               ; keep going while the offset is still negative
+.2:
+    and     wq, 2*mmsize-1              ; w = bytes left over after the vector loop
+    jz    .end
+    add   dstq, wq                      ; same negative-offset scheme for the tail
+    add   srcq, wq
+    neg     wq
+.3:                                     ; scalar tail loop, one byte per pass
+    mov  sizeb, [srcq + wq]             ; size is no longer needed: reuse its low byte as scratch
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+INIT_XMM sse2
+ADD_BYTES
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 1efb34d..8a755e6 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -23,7 +23,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
 
 void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, int w,
@@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
         c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
 #endif
 
-    if (INLINE_MMX(cpu_flags))
+    if (EXTERNAL_MMX(cpu_flags))
         c->add_bytes = ff_add_bytes_mmx;
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes            = ff_add_bytes_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
         if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvdsp_mmx.c b/libavcodec/x86/huffyuvdsp_mmx.c
index 5942210..ee6ec91 100644
--- a/libavcodec/x86/huffyuvdsp_mmx.c
+++ b/libavcodec/x86/huffyuvdsp_mmx.c
@@ -22,9 +22,7 @@
 #include "libavutil/x86/asm.h"
 #include "huffyuvdsp.h"
 
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
 void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
                                   const uint8_t *diff, int w,
                                   int *left, int *left_top)
@@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
     *left_top = tl;
 }
 #endif
-
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */



More information about the ffmpeg-cvslog mailing list