[FFmpeg-cvslog] x86/vp3dsp: port put_vp_no_rnd_pixels8_l2_mmx to yasm

James Almer git at videolan.org
Sat Dec 20 14:53:51 CET 2014


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Sat Dec 20 03:18:57 2014 -0300| [7696e429c7608c2959570a7bc448a76bbfa70dd4] | committer: Michael Niedermayer

x86/vp3dsp: port put_vp_no_rnd_pixels8_l2_mmx to yasm

Signed-off-by: James Almer <jamrial at gmail.com>
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7696e429c7608c2959570a7bc448a76bbfa70dd4
---

 libavcodec/x86/vp3dsp.asm    |   44 +++++++++++++++++++++++++++
 libavcodec/x86/vp3dsp_init.c |   69 ++++--------------------------------------
 2 files changed, 50 insertions(+), 63 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 24496ae..7ad4c31 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -36,6 +36,7 @@ vp3_idct_data: times 8 dw 64277
 pb_7:  times 8 db 0x07
 pb_1F: times 8 db 0x1f
 pb_81: times 8 db 0x81
+pb_FE: times 8 db 0xFE
 
 cextern pb_1
 cextern pb_3
@@ -147,6 +148,49 @@ cglobal vp3_h_loop_filter, 3, 4
     STORE_4_WORDS m3
     RET
 
+%macro PAVGB_NO_RND 0
+    mova   m4, m0
+    mova   m5, m2
+    pand   m4, m1
+    pand   m5, m3
+    pxor   m1, m0
+    pxor   m3, m2
+    pand   m1, m6
+    pand   m3, m6
+    psrlq  m1, 1
+    psrlq  m3, 1
+    paddb  m4, m1
+    paddb  m5, m3
+%endmacro
+
+INIT_MMX mmx
+cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
+    mova   m6, [pb_FE]
+    lea    stride3q,[strideq+strideq*2]
+.loop
+    mova   m0, [src1q]
+    mova   m1, [src2q]
+    mova   m2, [src1q+strideq]
+    mova   m3, [src2q+strideq]
+    PAVGB_NO_RND
+    mova   [dstq], m4
+    mova   [dstq+strideq], m5
+
+    mova   m0, [src1q+strideq*2]
+    mova   m1, [src2q+strideq*2]
+    mova   m2, [src1q+stride3q]
+    mova   m3, [src2q+stride3q]
+    PAVGB_NO_RND
+    mova   [dstq+strideq*2], m4
+    mova   [dstq+stride3q],  m5
+
+    lea    src1q, [src1q+strideq*4]
+    lea    src2q, [src2q+strideq*4]
+    lea    dstq,  [dstq+strideq*4]
+    sub    hd, 4
+    jnz .loop
+    RET
+
 ; from original comments: The Macro does IDct on 4 1-D Dcts
 %macro BeginIDCT 0
     movq          m2, I(3)
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index cc3eba4..354e1a1 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -23,10 +23,8 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavutil/x86/asm.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
-#include "config.h"
 
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
@@ -42,76 +40,21 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
 void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
                                  int *bounding_values);
 
-#if HAVE_MMX_INLINE
-
-#define MOVQ_BFE(regd)                                  \
-    __asm__ volatile (                                  \
-        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
-        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
-
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "pand  "#regb", "#regr"             \n\t"                    \
-    "pand  "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "paddb "#regb", "#regr"             \n\t"                    \
-    "paddb "#regd", "#regp"             \n\t"
-
-#if HAVE_6REGS
-static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
-{
-//    START_TIMER
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "movq   (%1,%4), %%mm2          \n\t"
-        "movq   (%2,%4), %%mm3          \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, (%3,%4)          \n\t"
-
-        "movq   (%1,%4,2), %%mm0        \n\t"
-        "movq   (%2,%4,2), %%mm1        \n\t"
-        "movq   (%1,%5), %%mm2          \n\t"
-        "movq   (%2,%5), %%mm3          \n\t"
-        "lea    (%1,%4,4), %1           \n\t"
-        "lea    (%2,%4,4), %2           \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3,%4,2)        \n\t"
-        "movq   %%mm5, (%3,%5)          \n\t"
-        "lea    (%3,%4,4), %3           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
-        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
-        :"memory");
-//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
-}
-#endif /*HAVE_6REGS */
-#endif /* HAVE_MMX_INLINE */
+void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
+                                     const uint8_t *b, ptrdiff_t stride,
+                                     int h);
 
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_6REGS && HAVE_MMX_INLINE
-    c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
-#endif /* HAVE_6REGS && HAVE_MMX_INLINE */
-
-#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
+#if ARCH_X86_32
         c->idct_put  = ff_vp3_idct_put_mmx;
         c->idct_add  = ff_vp3_idct_add_mmx;
-    }
 #endif
+    }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;



More information about the ffmpeg-cvslog mailing list