[FFmpeg-devel] [PATCH] vp9/x86: use explicit register for relative stack references.

Ronald S. Bultje rsbultje at gmail.com
Sat Jan 25 00:49:17 CET 2014


Before this patch, we explicitly modified rsp while walking through the
scratch buffer. That is not universally safe, since the space below the
stack pointer may be overwritten by things like signal handlers.
Therefore, use an explicit register to hold the current position relative
to the bottom of the stack (i.e. rsp), and leave rsp itself untouched.
This also clears the valgrind errors about the use of uninitialized data
that started occurring after the idct16x16/ssse3 optimizations were first
merged.
---
 libavcodec/x86/vp9itxfm.asm | 440 ++++++++++++++++++++++----------------------
 1 file changed, 218 insertions(+), 222 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index b142b8f..8087c2e 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -772,40 +772,40 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
 %endmacro
 
 %macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
-    VP9_IDCT16_1D_START %1, %3, 32, rsp+32
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
 
 %if %2 == 1
     ; backup a different register
-    mova          [rsp+16], m15
-    mova                m7, [rsp+32]
+    mova         [tmpq+16], m15
+    mova                m7, [tmpq+32]
 
     SUMSUB_BA            w,  6,  9, 15      ; t6, t9
     SUMSUB_BA            w,  7,  8, 15      ; t7, t8
 
     TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
-    mova         [rsp+  0], m0
-    mova         [rsp+ 32], m1
-    mova         [rsp+ 64], m2
-    mova         [rsp+ 96], m3
-    mova         [rsp+128], m4
-    mova         [rsp+160], m5
-    mova         [rsp+192], m6
-    mova         [rsp+224], m7
-
-    mova               m15, [rsp+16]
+    mova        [tmpq+  0], m0
+    mova        [tmpq+ 32], m1
+    mova        [tmpq+ 64], m2
+    mova        [tmpq+ 96], m3
+    mova        [tmpq+128], m4
+    mova        [tmpq+160], m5
+    mova        [tmpq+192], m6
+    mova        [tmpq+224], m7
+
+    mova               m15, [tmpq+16]
     TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
-    mova         [rsp+ 16], m8
-    mova         [rsp+ 48], m9
-    mova         [rsp+ 80], m10
-    mova         [rsp+112], m11
-    mova         [rsp+144], m12
-    mova         [rsp+176], m13
-    mova         [rsp+208], m14
-    mova         [rsp+240], m15
+    mova        [tmpq+ 16], m8
+    mova        [tmpq+ 48], m9
+    mova        [tmpq+ 80], m10
+    mova        [tmpq+112], m11
+    mova        [tmpq+144], m12
+    mova        [tmpq+176], m13
+    mova        [tmpq+208], m14
+    mova        [tmpq+240], m15
 %else ; %2 == 2
     ; backup more registers
-    mova          [rsp+64], m8
-    mova          [rsp+96], m9
+    mova         [tmpq+64], m8
+    mova         [tmpq+96], m9
 
     pxor                m7, m7
     pmulhrsw            m0, [pw_512]
@@ -823,9 +823,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
 
     ; restore from cache
     SWAP                 0, 7               ; move zero from m7 to m0
-    mova                m7, [rsp+32]
-    mova                m8, [rsp+64]
-    mova                m9, [rsp+96]
+    mova                m7, [tmpq+32]
+    mova                m8, [tmpq+64]
+    mova                m9, [tmpq+96]
 
     SUMSUB_BA            w,  6,  9, 1       ; t6, t9
     SUMSUB_BA            w,  7,  8, 1       ; t7, t8
@@ -871,7 +871,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx
 
 %macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
 INIT_XMM %1
-cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
     ; 2x2=eob=3, 4x4=eob=10
     cmp eobd, 38
     jg .idctfull
@@ -894,19 +894,19 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
     RET
 
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
 .idct8x8:
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
+    mov               tmpq, rsp
     VP9_IDCT16_1D   blockq, 1, 8
 
     mov               cntd, 2
     mov           dst_bakq, dstq
 .loop2_8x8:
-    VP9_IDCT16_1D      rsp, 2, 8
+    VP9_IDCT16_1D     tmpq, 2, 8
     lea               dstq, [dst_bakq+8]
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_8x8
-    sub                rsp, 32
 
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -914,26 +914,25 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     RET
 
 .idctfull:
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
     mov               cntd, 2
+    mov               tmpq, rsp
 .loop1_full:
     VP9_IDCT16_1D   blockq, 1
     add             blockq, 16
-    add                rsp, 256
+    add               tmpq, 256
     dec               cntd
     jg .loop1_full
     sub             blockq, 32
-    sub                rsp, 512
 
     mov               cntd, 2
+    mov               tmpq, rsp
     mov           dst_bakq, dstq
 .loop2_full:
-    VP9_IDCT16_1D      rsp, 2
+    VP9_IDCT16_1D     tmpq, 2
     lea               dstq, [dst_bakq+8]
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_full
-    sub                rsp, 32
 
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -970,7 +969,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_RND_SH_SUMSUB_BA     2, 11,  5,  7, 12, [pd_8192]   ; m2=t2[w], m11=t10[w]
     VP9_RND_SH_SUMSUB_BA     3, 10,  4,  6, 12, [pd_8192]   ; m3=t3[w], m10=t11[w]
 
-    mova    [rsp+ 0*%%str], m9          ; make some scratch space (t0:m9->r0)
+    mova   [tmpq+ 0*%%str], m9          ; make some scratch space (t0:m9->r0)
     mova                m4, [%1+ 4*32]  ; in4
     mova                m5, [%1+11*32]  ; in11
     mova               m12, [%1+ 3*32]  ; in3
@@ -981,10 +980,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_RND_SH_SUMSUB_BA    13,  4, 15,  6,  9, [pd_8192]   ; m13=t4[w], m4=t12[w]
     VP9_RND_SH_SUMSUB_BA    12,  5, 14,  7,  9, [pd_8192]   ; m12=t5[w], m5=t13[w]
 
-    mova    [rsp+ 2*%%str], m8          ; t1:m9->r2
-    mova    [rsp+ 3*%%str], m2          ; t2:m2->r3
-    mova    [rsp+ 4*%%str], m3          ; t3:m3->r4
-    mova    [rsp+ 5*%%str], m13         ; t4:m13->r5
+    mova   [tmpq+ 2*%%str], m8          ; t1:m8->r2
+    mova   [tmpq+ 3*%%str], m2          ; t2:m2->r3
+    mova   [tmpq+ 4*%%str], m3          ; t3:m3->r4
+    mova   [tmpq+ 5*%%str], m13         ; t4:m13->r5
     mova                m2, [%1+ 6*32]  ; in6
     mova                m3, [%1+ 9*32]  ; in9
     mova                m8, [%1+ 1*32]  ; in1
@@ -1030,16 +1029,16 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 
     ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
 
-    mova                    m6, [rsp+ 0*%%str]
-    mova                    m7, [rsp+ 2*%%str]
-    mova                   m13, [rsp+ 3*%%str]
-    mova                   m14, [rsp+ 4*%%str]
-    mova                   m15, [rsp+ 5*%%str]
-    mova        [rsp+ 8*%%str], m5
-    mova        [rsp+ 9*%%str], m4
-    mova        [rsp+10*%%str], m0
-    mova        [rsp+11*%%str], m10
-    mova        [rsp+12*%%str], m2
+    mova                    m6, [tmpq+ 0*%%str]
+    mova                    m7, [tmpq+ 2*%%str]
+    mova                   m13, [tmpq+ 3*%%str]
+    mova                   m14, [tmpq+ 4*%%str]
+    mova                   m15, [tmpq+ 5*%%str]
+    mova       [tmpq+ 8*%%str], m5
+    mova       [tmpq+ 9*%%str], m4
+    mova       [tmpq+10*%%str], m0
+    mova       [tmpq+11*%%str], m10
+    mova       [tmpq+12*%%str], m2
 
     ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
     ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
@@ -1069,32 +1068,32 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
 
 %if %2 == 1
-    mova                    m0, [rsp+ 8*%%str]
+    mova                    m0, [tmpq+ 8*%%str]
     TRANSPOSE8x8W            9, 3, 11, 14, 7, 1, 0, 12, 2
-    mova           [rsp+ 0*16], m9
-    mova           [rsp+ 2*16], m3
-    mova           [rsp+ 4*16], m11
-    mova           [rsp+ 6*16], m14
-    mova                    m9, [rsp+ 9*%%str]
-    mova                    m3, [rsp+10*%%str]
-    mova                   m11, [rsp+11*%%str]
-    mova                   m14, [rsp+12*%%str]
-    mova           [rsp+ 8*16], m7
-    mova           [rsp+10*16], m1
-    mova           [rsp+12*16], m0
-    mova           [rsp+14*16], m12
+    mova          [tmpq+ 0*16], m9
+    mova          [tmpq+ 2*16], m3
+    mova          [tmpq+ 4*16], m11
+    mova          [tmpq+ 6*16], m14
+    mova                    m9, [tmpq+ 9*%%str]
+    mova                    m3, [tmpq+10*%%str]
+    mova                   m11, [tmpq+11*%%str]
+    mova                   m14, [tmpq+12*%%str]
+    mova          [tmpq+ 8*16], m7
+    mova          [tmpq+10*16], m1
+    mova          [tmpq+12*16], m0
+    mova          [tmpq+14*16], m12
 
     TRANSPOSE8x8W           15, 9, 3, 6, 13, 11, 14, 8, 2
-    mova           [rsp+ 1*16], m15
-    mova           [rsp+ 3*16], m9
-    mova           [rsp+ 5*16], m3
-    mova           [rsp+ 7*16], m6
-    mova           [rsp+ 9*16], m13
-    mova           [rsp+11*16], m11
-    mova           [rsp+13*16], m14
-    mova           [rsp+15*16], m8
+    mova          [tmpq+ 1*16], m15
+    mova          [tmpq+ 3*16], m9
+    mova          [tmpq+ 5*16], m3
+    mova          [tmpq+ 7*16], m6
+    mova          [tmpq+ 9*16], m13
+    mova          [tmpq+11*16], m11
+    mova          [tmpq+13*16], m14
+    mova          [tmpq+15*16], m8
 %else
-    mova                    m5, [rsp+ 8*%%str]
+    mova                    m5, [tmpq+ 8*%%str]
     pxor                    m0, m0
 
     pmulhrsw                m9, [pw_512]
@@ -1114,10 +1113,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_STORE_2X             5, 12, 2, 4, 0
     lea                   dstq, [dstq+strideq*2]
 
-    mova                    m9, [rsp+ 9*%%str]
-    mova                    m3, [rsp+10*%%str]
-    mova                   m11, [rsp+11*%%str]
-    mova                   m14, [rsp+12*%%str]
+    mova                    m9, [tmpq+ 9*%%str]
+    mova                    m3, [tmpq+10*%%str]
+    mova                   m11, [tmpq+11*%%str]
+    mova                   m14, [tmpq+12*%%str]
 
     pmulhrsw               m15, [pw_512]
     pmulhrsw                m9, [pw_512]
@@ -1139,29 +1138,26 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 
 %macro IADST16_FN 5
 INIT_XMM %5
-cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
-    ; potential eob checks go here
-
-    DEFINE_ARGS dst, stride, block, cnt, dst_bak
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
     mov               cntd, 2
+    mov               tmpq, rsp
 .loop1_full:
     VP9_%2_1D       blockq, 1
     add             blockq, 16
-    add                rsp, 256
+    add               tmpq, 256
     dec               cntd
     jg .loop1_full
     sub             blockq, 32
-    sub                rsp, 512
 
     mov               cntd, 2
+    mov               tmpq, rsp
     mov           dst_bakq, dstq
 .loop2_full:
-    VP9_%4_1D          rsp, 2
+    VP9_%4_1D         tmpq, 2
     lea               dstq, [dst_bakq+8]
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_full
-    sub                rsp, 32
 
     ; at the end of the loop, m0 should still be zero
     ; use that to zero out block coefficients
@@ -1183,11 +1179,11 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
 %assign %%str 16*%2*%2
     ; first do t0-15, this can be done identical to idct16x16
-    VP9_IDCT16_1D_START %1, %3/2, 64*2, rsp+ 4*%%str
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
 
     ; backup a different register
-    mova     [rsp+30*%%str], m15    ; t15
-    mova                m7, [rsp+ 4*%%str]
+    mova    [tmpq+30*%%str], m15    ; t15
+    mova                m7, [tmpq+ 4*%%str]
 
     SUMSUB_BA            w,  6,  9, 15      ; t6, t9
     SUMSUB_BA            w,  7,  8, 15      ; t7, t8
@@ -1195,21 +1191,21 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; store everything on stack to make space available for t16-31
     ; we store interleaved with the output of the second half (t16-31)
     ; so we don't need to allocate extra stack space
-    mova     [rsp+ 0*%%str], m0     ; t0
-    mova     [rsp+ 4*%%str], m1     ; t1
-    mova     [rsp+ 8*%%str], m2     ; t2
-    mova     [rsp+12*%%str], m3     ; t3
-    mova     [rsp+16*%%str], m4     ; t4
-    mova     [rsp+20*%%str], m5     ; t5
-    mova     [rsp+24*%%str], m6     ; t6
-    mova     [rsp+28*%%str], m7     ; t7
-    mova     [rsp+ 2*%%str], m8     ; t8
-    mova     [rsp+ 6*%%str], m9     ; t9
-    mova     [rsp+10*%%str], m10    ; t10
-    mova     [rsp+14*%%str], m11    ; t11
-    mova     [rsp+18*%%str], m12    ; t12
-    mova     [rsp+22*%%str], m13    ; t13
-    mova     [rsp+26*%%str], m14    ; t14
+    mova    [tmpq+ 0*%%str], m0     ; t0
+    mova    [tmpq+ 4*%%str], m1     ; t1
+    mova    [tmpq+ 8*%%str], m2     ; t2
+    mova    [tmpq+12*%%str], m3     ; t3
+    mova    [tmpq+16*%%str], m4     ; t4
+    mova    [tmpq+20*%%str], m5     ; t5
+    mova    [tmpq+24*%%str], m6     ; t6
+    mova    [tmpq+28*%%str], m7     ; t7
+    mova    [tmpq+ 2*%%str], m8     ; t8
+    mova    [tmpq+ 6*%%str], m9     ; t9
+    mova    [tmpq+10*%%str], m10    ; t10
+    mova    [tmpq+14*%%str], m11    ; t11
+    mova    [tmpq+18*%%str], m12    ; t12
+    mova    [tmpq+22*%%str], m13    ; t13
+    mova    [tmpq+26*%%str], m14    ; t14
 
     ; then, secondly, do t16-31
 %if %3 <= 8
@@ -1235,8 +1231,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; from 1 stage forward
     SUMSUB_BA                 w,  8,  4,  1
     ; temporary storage
-    mova     [rsp+17*%%str], m8             ; t16
-    mova     [rsp+21*%%str], m4             ; t19
+    mova    [tmpq+17*%%str], m8             ; t16
+    mova    [tmpq+21*%%str], m4             ; t19
     VP9_UNPACK_MULSUB_2W_4X   1, 14, 15,  0,  9102, 13623, [pd_8192], 4,  8 ; t21, t26
     VP9_UNPACK_MULSUB_2W_4X  13,  2,  3, 12, 13623, m9102, [pd_8192], 4,  8 ; t22, t25
 
@@ -1289,8 +1285,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; from 2 stages forward
     SUMSUB_BA             w,  8,  4,  2
     ; temporary storage
-    mova     [rsp+17*%%str], m8             ; t16
-    mova     [rsp+21*%%str], m4             ; t19
+    mova    [tmpq+17*%%str], m8             ; t16
+    mova    [tmpq+21*%%str], m4             ; t19
 %if %3 <= 16
     pmulhrsw             m3, m12, [pw_13160x2]
     pmulhrsw            m12, [pw_9760x2]
@@ -1336,7 +1332,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
     ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
 
-    mova                 m8, [rsp+17*%%str] ; t16
+    mova                 m8, [tmpq+17*%%str] ; t16
     ; from 2 stages forward
     SUMSUB_BA             w,  0,  8,  4
     SUMSUB_BA             w, 15,  7,  4
@@ -1345,10 +1341,10 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     pmulhrsw             m7, [pw_11585x2]
     pmulhrsw             m8, [pw_11585x2]
     ; store t16/t23
-    mova     [rsp+ 1*%%str], m0     ; t16
-    mova     [rsp+29*%%str], m7     ; t23
+    mova    [tmpq+ 1*%%str], m0     ; t16
+    mova    [tmpq+29*%%str], m7     ; t23
 
-    mova                 m4, [rsp+21*%%str] ; t19
+    mova                 m4, [tmpq+21*%%str] ; t19
     VP9_UNPACK_MULSUB_2W_4X  10,  5, 15137,  6270, [pd_8192], 0, 7 ; t18, t29
     VP9_UNPACK_MULSUB_2W_4X  11,  4, 15137,  6270, [pd_8192], 0, 7 ; t19, t28
     VP9_UNPACK_MULSUB_2W_4X   3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
@@ -1384,27 +1380,27 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
     ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
     ; final sumsub of pass 2
-    mova     [rsp+ 5*%%str], m1     ; t17
-    mova     [rsp+ 9*%%str], m2     ; t18
-    mova     [rsp+13*%%str], m3     ; t19
+    mova    [tmpq+ 5*%%str], m1     ; t17
+    mova    [tmpq+ 9*%%str], m2     ; t18
+    mova    [tmpq+13*%%str], m3     ; t19
 
     ; then do final pass to sumsub+store the two halves
 %if %2 == 1
-    mova     [rsp+17*%%str], m4     ; t20
-    mova     [rsp+21*%%str], m5     ; t21
-    mova     [rsp+25*%%str], m6     ; t22
-
-    mova                 m0, [rsp+ 0*%%str] ; t0
-    mova                 m1, [rsp+ 4*%%str] ; t1
-    mova                 m2, [rsp+ 8*%%str] ; t2
-    mova                 m3, [rsp+12*%%str] ; t3
-    mova                 m4, [rsp+16*%%str] ; t4
-    mova                 m5, [rsp+20*%%str] ; t5
-    mova                 m6, [rsp+24*%%str] ; t6
+    mova    [tmpq+17*%%str], m4     ; t20
+    mova    [tmpq+21*%%str], m5     ; t21
+    mova    [tmpq+25*%%str], m6     ; t22
+
+    mova                 m0, [tmpq+ 0*%%str] ; t0
+    mova                 m1, [tmpq+ 4*%%str] ; t1
+    mova                 m2, [tmpq+ 8*%%str] ; t2
+    mova                 m3, [tmpq+12*%%str] ; t3
+    mova                 m4, [tmpq+16*%%str] ; t4
+    mova                 m5, [tmpq+20*%%str] ; t5
+    mova                 m6, [tmpq+24*%%str] ; t6
 
     SUMSUB_BA             w, 15,  0, 7
-    mova     [rsp+ 3*%%str], m0             ; t15
-    mova                 m7, [rsp+28*%%str] ; t7
+    mova    [tmpq+ 3*%%str], m0              ; t15
+    mova                 m7, [tmpq+28*%%str] ; t7
     SUMSUB_BA             w, 14,  1, 0
     SUMSUB_BA             w, 13,  2, 0
     SUMSUB_BA             w, 12,  3, 0
@@ -1414,45 +1410,45 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     SUMSUB_BA             w,  8,  7, 0
 
     TRANSPOSE8x8W        15, 14, 13, 12, 11, 10, 9, 8, 0
-    mova     [rsp+ 0*%%str], m15
-    mova     [rsp+ 4*%%str], m14
-    mova     [rsp+ 8*%%str], m13
-    mova     [rsp+12*%%str], m12
-    mova     [rsp+16*%%str], m11
-    mova     [rsp+20*%%str], m10
-    mova     [rsp+24*%%str], m9
-    mova     [rsp+28*%%str], m8
-
-    mova                  m0, [rsp+ 3*%%str] ; t15
+    mova    [tmpq+ 0*%%str], m15
+    mova    [tmpq+ 4*%%str], m14
+    mova    [tmpq+ 8*%%str], m13
+    mova    [tmpq+12*%%str], m12
+    mova    [tmpq+16*%%str], m11
+    mova    [tmpq+20*%%str], m10
+    mova    [tmpq+24*%%str], m9
+    mova    [tmpq+28*%%str], m8
+
+    mova                  m0, [tmpq+ 3*%%str] ; t15
     TRANSPOSE8x8W          7, 6, 5, 4, 3, 2, 1, 0, 8
-    mova     [rsp+ 3*%%str], m7
-    mova     [rsp+ 7*%%str], m6
-    mova     [rsp+11*%%str], m5
-    mova     [rsp+15*%%str], m4
-    mova     [rsp+19*%%str], m3
-    mova     [rsp+23*%%str], m2
-    mova     [rsp+27*%%str], m1
-    mova     [rsp+31*%%str], m0
-
-    mova                m15, [rsp+ 2*%%str] ; t8
-    mova                m14, [rsp+ 6*%%str] ; t9
-    mova                m13, [rsp+10*%%str] ; t10
-    mova                m12, [rsp+14*%%str] ; t11
-    mova                m11, [rsp+18*%%str] ; t12
-    mova                m10, [rsp+22*%%str] ; t13
-    mova                 m9, [rsp+26*%%str] ; t14
-    mova                 m8, [rsp+30*%%str] ; t15
-    mova                 m7, [rsp+ 1*%%str] ; t16
-    mova                 m6, [rsp+ 5*%%str] ; t17
-    mova                 m5, [rsp+ 9*%%str] ; t18
-    mova                 m4, [rsp+13*%%str] ; t19
-    mova                 m3, [rsp+17*%%str] ; t20
-    mova                 m2, [rsp+21*%%str] ; t21
-    mova                 m1, [rsp+25*%%str] ; t22
+    mova    [tmpq+ 3*%%str], m7
+    mova    [tmpq+ 7*%%str], m6
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+15*%%str], m4
+    mova    [tmpq+19*%%str], m3
+    mova    [tmpq+23*%%str], m2
+    mova    [tmpq+27*%%str], m1
+    mova    [tmpq+31*%%str], m0
+
+    mova                m15, [tmpq+ 2*%%str] ; t8
+    mova                m14, [tmpq+ 6*%%str] ; t9
+    mova                m13, [tmpq+10*%%str] ; t10
+    mova                m12, [tmpq+14*%%str] ; t11
+    mova                m11, [tmpq+18*%%str] ; t12
+    mova                m10, [tmpq+22*%%str] ; t13
+    mova                 m9, [tmpq+26*%%str] ; t14
+    mova                 m8, [tmpq+30*%%str] ; t15
+    mova                 m7, [tmpq+ 1*%%str] ; t16
+    mova                 m6, [tmpq+ 5*%%str] ; t17
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+21*%%str] ; t21
+    mova                 m1, [tmpq+25*%%str] ; t22
 
     SUMSUB_BA             w,  7,  8, 0
-    mova     [rsp+ 2*%%str], m8
-    mova                 m0, [rsp+29*%%str] ; t23
+    mova    [tmpq+ 2*%%str], m8
+    mova                 m0, [tmpq+29*%%str] ; t23
     SUMSUB_BA             w,  6,  9, 8
     SUMSUB_BA             w,  5, 10, 8
     SUMSUB_BA             w,  4, 11, 8
@@ -1462,29 +1458,29 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     SUMSUB_BA             w,  0, 15, 8
 
     TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
-    mova     [rsp+ 1*%%str], m0
-    mova     [rsp+ 5*%%str], m1
-    mova     [rsp+ 9*%%str], m2
-    mova     [rsp+13*%%str], m3
-    mova     [rsp+17*%%str], m4
-    mova     [rsp+21*%%str], m5
-    mova     [rsp+25*%%str], m6
-    mova     [rsp+29*%%str], m7
-
-    mova                 m8, [rsp+ 2*%%str]
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+17*%%str], m4
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m8, [tmpq+ 2*%%str]
     TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
-    mova     [rsp+ 2*%%str], m8
-    mova     [rsp+ 6*%%str], m9
-    mova     [rsp+10*%%str], m10
-    mova     [rsp+14*%%str], m11
-    mova     [rsp+18*%%str], m12
-    mova     [rsp+22*%%str], m13
-    mova     [rsp+26*%%str], m14
-    mova     [rsp+30*%%str], m15
+    mova    [tmpq+ 2*%%str], m8
+    mova    [tmpq+ 6*%%str], m9
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+    mova    [tmpq+18*%%str], m12
+    mova    [tmpq+22*%%str], m13
+    mova    [tmpq+26*%%str], m14
+    mova    [tmpq+30*%%str], m15
 %else
-    ; t0-7 is in [rsp+{0,4,8,12,16,20,24,28}*%%str]
-    ; t8-15 is in [rsp+{2,6,10,14,18,22,26,30}*%%str]
-    ; t16-19 and t23 is in [rsp+{1,5,9,13,29}*%%str]
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
     ; t20-22 is in m4-6
     ; t24-31 is in m8-15
     pxor                m7, m7
@@ -1507,55 +1503,55 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 %endmacro
 
     ; store t0-1 and t30-31
-    mova                m0, [rsp+ 0*%%str]
-    mova                m1, [rsp+ 4*%%str]
+    mova                m0, [tmpq+ 0*%%str]
+    mova                m1, [tmpq+ 4*%%str]
     %%STORE_2X2          0,  1, 14, 15, 2, 3, 7
 
     ; store t2-3 and t28-29
-    mova                m0, [rsp+ 8*%%str]
-    mova                m1, [rsp+12*%%str]
+    mova                m0, [tmpq+ 8*%%str]
+    mova                m1, [tmpq+12*%%str]
     %%STORE_2X2          0,  1, 12, 13, 2, 3, 7
 
     ; store t4-5 and t26-27
-    mova                m0, [rsp+16*%%str]
-    mova                m1, [rsp+20*%%str]
+    mova                m0, [tmpq+16*%%str]
+    mova                m1, [tmpq+20*%%str]
     %%STORE_2X2          0,  1, 10, 11, 2, 3, 7
 
     ; store t6-7 and t24-25
-    mova                m0, [rsp+24*%%str]
-    mova                m1, [rsp+28*%%str]
+    mova                m0, [tmpq+24*%%str]
+    mova                m1, [tmpq+28*%%str]
     %%STORE_2X2          0,  1,  8,  9, 2, 3, 7
 
     ; store t8-9 and t22-23
-    mova                m0, [rsp+ 2*%%str]
-    mova                m1, [rsp+ 6*%%str]
-    mova                m8, [rsp+29*%%str]
+    mova                m0, [tmpq+ 2*%%str]
+    mova                m1, [tmpq+ 6*%%str]
+    mova                m8, [tmpq+29*%%str]
     %%STORE_2X2          0,  1,  6,  8, 2, 3, 7
 
     ; store t10-11 and t20-21
-    mova                m0, [rsp+10*%%str]
-    mova                m1, [rsp+14*%%str]
+    mova                m0, [tmpq+10*%%str]
+    mova                m1, [tmpq+14*%%str]
     %%STORE_2X2          0,  1,  4,  5, 2, 3, 7
 
     ; store t12-13 and t18-19
-    mova                m0, [rsp+18*%%str]
-    mova                m1, [rsp+22*%%str]
-    mova                m5, [rsp+13*%%str]
-    mova                m4, [rsp+ 9*%%str]
+    mova                m0, [tmpq+18*%%str]
+    mova                m1, [tmpq+22*%%str]
+    mova                m5, [tmpq+13*%%str]
+    mova                m4, [tmpq+ 9*%%str]
     %%STORE_2X2          0,  1,  4,  5, 2, 3, 7
 
     ; store t14-17
-    mova                m0, [rsp+26*%%str]
-    mova                m1, [rsp+30*%%str]
-    mova                m5, [rsp+ 5*%%str]
-    mova                m4, [rsp+ 1*%%str]
+    mova                m0, [tmpq+26*%%str]
+    mova                m1, [tmpq+30*%%str]
+    mova                m5, [tmpq+ 5*%%str]
+    mova                m4, [tmpq+ 1*%%str]
     %%STORE_2X2          0,  1,  4,  5, 2, 3, 7, 0
 %endif
 %endmacro
 
 %macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
 INIT_XMM %1
-cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
     cmp eobd, 135
     jg .idctfull
     cmp eobd, 34
@@ -1580,8 +1576,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
     RET
 
-    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
 .idct8x8:
+    mov               tmpq, rsp
     VP9_IDCT32_1D   blockq, 1, 8
 
     mov          stride30q, strideq         ; stride
@@ -1592,12 +1589,11 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 .loop2_8x8:
     mov               dstq, dst_bakq
     lea           dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D      rsp, 2, 8
+    VP9_IDCT32_1D     tmpq, 2, 8
     add           dst_bakq, 8
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_8x8
-    sub                rsp, 64
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
@@ -1606,29 +1602,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 
 .idct16x16:
     mov               cntd, 2
+    mov               tmpq, rsp
 .loop1_16x16:
     VP9_IDCT32_1D   blockq, 1, 16
     add             blockq, 16
-    add                rsp, 512
+    add               tmpq, 512
     dec               cntd
     jg .loop1_16x16
     sub             blockq, 32
-    sub                rsp, 1024
 
     mov          stride30q, strideq         ; stride
     lea           stride2q, [strideq*2]     ; stride*2
     shl          stride30q, 5               ; stride*32
     mov               cntd, 4
+    mov               tmpq, rsp
     sub          stride30q, stride2q        ; stride*30
 .loop2_16x16:
     mov               dstq, dst_bakq
     lea           dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D      rsp, 2, 16
+    VP9_IDCT32_1D     tmpq, 2, 16
     add           dst_bakq, 8
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_16x16
-    sub                rsp, 64
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
@@ -1637,29 +1633,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
 
 .idctfull:
     mov               cntd, 4
+    mov               tmpq, rsp
 .loop1_full:
     VP9_IDCT32_1D   blockq, 1
     add             blockq, 16
-    add                rsp, 512
+    add               tmpq, 512
     dec               cntd
     jg .loop1_full
     sub             blockq, 64
-    sub                rsp, 2048
 
     mov          stride30q, strideq         ; stride
     lea           stride2q, [strideq*2]     ; stride*2
     shl          stride30q, 5               ; stride*32
     mov               cntd, 4
+    mov               tmpq, rsp
     sub          stride30q, stride2q        ; stride*30
 .loop2_full:
     mov               dstq, dst_bakq
     lea           dst_endq, [dst_bakq+stride30q]
-    VP9_IDCT32_1D      rsp, 2
+    VP9_IDCT32_1D     tmpq, 2
     add           dst_bakq, 8
-    add                rsp, 16
+    add               tmpq, 16
     dec               cntd
     jg .loop2_full
-    sub                rsp, 64
 
     ; at the end of the loop, m7 should still be zero
     ; use that to zero out block coefficients
-- 
1.8.4



More information about the ffmpeg-devel mailing list