[FFmpeg-cvslog] vp9/x86: save one register on 32bit idct32x32.

Ronald S. Bultje git at videolan.org
Tue Dec 16 03:17:59 CET 2014


ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Mon Dec 15 20:40:48 2014 -0500| [0a7964dca5e52536c05a72987c3d7dbb12add942] | committer: Michael Niedermayer

vp9/x86: save one register on 32bit idct32x32.

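Fixes build on win32. On 32-bit x86 the function now requests 6 general-purpose registers instead of 7: arguments are no longer loaded up front but are reloaded from their stack slots where each phase needs them, with the register names re-mapped per phase via DEFINE_ARGS.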

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0a7964dca5e52536c05a72987c3d7dbb12add942
---

 libavcodec/x86/vp9itxfm.asm |   51 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 908040c..64859a0 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -2526,7 +2526,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 
 %macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
 INIT_XMM %1
-cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+    movifnidn         eobd, dword eobm
 %if cpuflag(ssse3)
     cmp eobd, 135
     jg .idctfull
@@ -2540,6 +2541,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
 %endif
 
     ; dc-only case
+    movifnidn       blockq, blockmp
+    movifnidn         dstq, dstmp
+    movifnidn      strideq, stridemp
 %if cpuflag(ssse3)
     movd                m0, [blockq]
     mova                m1, [pw_11585x2]
@@ -2572,15 +2576,22 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
 %if ARCH_X86_64
     DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
 %else
-    DEFINE_ARGS dst, stride, block, stride30, dst_end, stride2, tmp
-%define cntd dword r4m
 %define dst_bakq r0mp
 %endif
 %if cpuflag(ssse3)
 .idct8x8:
+%if ARCH_X86_32
+    DEFINE_ARGS block, u1, u2, u3, u4, tmp
+    mov             blockq, r2mp
+%endif
     mov               tmpq, rsp
     VP9_IDCT32_1D   blockq, 1, 8
 
+%if ARCH_X86_32
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
     mov          stride30q, strideq         ; stride
     lea           stride2q, [strideq*2]     ; stride*2
     shl          stride30q, 5               ; stride*32
@@ -2597,10 +2608,18 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
     ZERO_BLOCK      blockq, 64,  8, m1
     RET
 
 .idct16x16:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
     mov               cntd, 2
     mov               tmpq, rsp
 .loop1_16x16:
@@ -2609,7 +2628,14 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
     add               tmpq, 512
     dec               cntd
     jg .loop1_16x16
+
+%if ARCH_X86_64
     sub             blockq, 32
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
 
     mov          stride30q, strideq         ; stride
     lea           stride2q, [strideq*2]     ; stride*2
@@ -2628,11 +2654,19 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
     ZERO_BLOCK      blockq, 64, 16, m1
     RET
 %endif
 
 .idctfull:
+%if ARCH_X86_32
+    DEFINE_ARGS block, tmp, cnt
+    mov             blockq, r2mp
+%endif
     mov               cntd, 4
     mov               tmpq, rsp
 .loop1_full:
@@ -2641,7 +2675,14 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
     add               tmpq, 512
     dec               cntd
     jg .loop1_full
+
+%if ARCH_X86_64
     sub             blockq, 64
+%else
+    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+    mov            strideq, r1mp
+%define cntd dword r3m
+%endif
 
     mov          stride30q, strideq         ; stride
     lea           stride2q, [strideq*2]     ; stride*2
@@ -2660,6 +2701,10 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
+%if ARCH_X86_32
+    DEFINE_ARGS block
+    mov             blockq, r2mp
+%endif
     ZERO_BLOCK      blockq, 64, 32, m1
     RET
 %endmacro



More information about the ffmpeg-cvslog mailing list