[FFmpeg-cvslog] x86inc: Utilize the shadow space on 64-bit Windows

Henrik Gramner git at videolan.org
Tue Oct 8 11:37:10 CEST 2013


ffmpeg | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Sep 11 17:49:26 2013 +0200| [bbe4a6db44f0b55b424a5cc9d3e89cd88e250450] | committer: Derek Buitenhuis

x86inc: Utilize the shadow space on 64-bit Windows

Store XMM6 and XMM7 in the shadow space in functions that
clobber them. This way we don't have to adjust the stack
pointer as often, reducing the number of instructions as
well as the code size.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bbe4a6db44f0b55b424a5cc9d3e89cd88e250450
---

 libavcodec/x86/fft.asm          |    4 +-
 libavcodec/x86/h264_deblock.asm |   19 ++++-----
 libavutil/x86/x86inc.asm        |   83 ++++++++++++++++++++++-----------------
 3 files changed, 55 insertions(+), 51 deletions(-)
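
For readers unfamiliar with the mechanism this patch exploits: the Win64
calling convention requires every caller to reserve 32 bytes of "shadow
space" (also called home space) immediately above the return address before
any CALL, and the callee is free to use it as scratch storage. A minimal
sketch of the stack layout at function entry (offsets relative to the
incoming rsp; the annotations are illustrative, not part of the patch):

    ; Win64 stack at function entry, before any push/sub:
    ;   [rsp]      return address
    ;   [rsp+ 8]   shadow space, bytes 0-15  <- holds xmm6 after this patch
    ;   [rsp+24]   shadow space, bytes 16-31 <- holds xmm7 after this patch
    ;   [rsp+40]   fifth argument onwards, if any

Since rsp is 8 mod 16 at entry, both slots are 16-byte aligned and can be
written with movaps, so a function clobbering only xmm6/xmm7 needs no stack
adjustment at all.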

diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index c87752b..e4744a3 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -667,13 +667,13 @@ cglobal imdct_calc, 3,5,3
     push    r1
     push    r0
 %else
-    sub     rsp, 8
+    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
 %endif
     call    r4
 %if ARCH_X86_32
     add     esp, 12
 %else
-    add     rsp, 8
+    add     rsp, 8+32*WIN64
 %endif
     POP     r1
     POP     r3
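
The fft.asm hunk is the caller side of the same contract: before `call r4`
on Win64, this code must itself provide 32 bytes of shadow space for the
callee. WIN64 expands to 1 when assembling for that ABI and 0 otherwise, so
`8+32*WIN64` folds back to plain 8 elsewhere. A standalone sketch of the
pattern (the callee name is hypothetical):

    %if ARCH_X86_64
        sub  rsp, 8+32*WIN64 ; 8 bytes to realign, plus the callee's shadow space on Win64
        call some_callee     ; on Win64 the reserved 32 bytes become the callee's shadow space
        add  rsp, 8+32*WIN64
    %endif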
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index fc6c983..6e29ce7 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     movsxd r7,  r1d
     lea    r8,  [r7+r7*2]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
 %if WIN64
-    sub    rsp, 0x98
-    %define pix_tmp rsp+0x30
+    %define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-    sub    rsp, 0x68
     %define pix_tmp rsp
 %endif
 
@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)
 
-%if WIN64
-    add    rsp, 0x98
-%else
-    add    rsp, 0x68
-%endif
     RET
 %endmacro
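
The 0x30 offset here follows the new inline comment: below pix_tmp, the low
32 bytes stay free as shadow space for the nested call into the vertical
deblock routine, followed by a 16-byte slot used to preserve r4 across that
call (padded to 16 bytes so pix_tmp stays aligned). A sketch of the Win64
frame after the cglobal prologue, as I read the code (not part of the patch):

    ; [rsp+0x00..0x1f]  shadow space for the nested call
    ; [rsp+0x20..0x2f]  spill slot for r4, padded for 16-byte alignment
    ; [rsp+0x30..]      pix_tmp, the transpose buffer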
 
@@ -704,13 +697,16 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
     movsxd r7,  r1d
     lea    r8,  [r7*3]
     lea    r6,  [r0-4]
     lea    r5,  [r0-4+r8]
-    sub    rsp, 0x88
+%if WIN64
+    %define pix_tmp rsp+0x20 ; shadow space
+%else
     %define pix_tmp rsp
+%endif
 
     ; transpose 8x16 -> tmp space
     TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -730,7 +726,6 @@ cglobal deblock_h_luma_intra_8, 4,9
     sub    r5,  r7
     shr    r7,  3
     TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra_8, 2,4,8,0x80
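
Likewise here: the explicit `sub rsp, 0x88` disappears because cglobal now
both allocates the requested 0x80 bytes and, on Win64, prepends the 32-byte
shadow space (see the x86inc hunk below), so pix_tmp lands just above it at
rsp+0x20. A sketch of the resulting Win64 frame, presumably covering the
nested call into the vertical routine as in the non-intra version (again my
reading, not part of the patch):

    ; [rsp+0x00..0x1f]  shadow space for the nested call
    ; [rsp+0x20..]      pix_tmp (0x80 bytes requested via cglobal)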
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 6878b35..a98f559 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -334,14 +334,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %if mmsize != 8
-                %assign xmm_regs_used %2
+            %assign stack_size_padded stack_size
+            %if WIN64
+                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                    %endif
+                %endif
             %endif
             %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                %endif
+                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -351,14 +355,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
                 mov  rstk, rsp
-                %assign stack_size_padded stack_size
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                    %if mmsize == 32 && xmm_regs_used & 1
-                        ; re-align to 32 bytes
-                        %assign stack_size_padded (stack_size_padded + 16)
-                    %endif
-                %endif
                 %if %1 < 0 ; need to store rsp on stack
                     sub  rsp, gprsize+stack_size_padded
                     and  rsp, ~(%%stack_alignment-1)
@@ -370,9 +366,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                     %xdefine rstkm rstk
                 %endif
             %endif
-            %if xmm_regs_used > 6
-                WIN64_PUSH_XMM
-            %endif
+            WIN64_PUSH_XMM
         %endif
     %endif
 %endmacro
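
The net effect of this PROLOGUE change, worked through for a hypothetical
Win64 function declared as `cglobal foo, 4,7,10, 64` on an aligned stack
(no GPR pushes, so stack_offset is 0 at allocation time):

    ; stack_size                   = 64   requested local space
    ; + 32                                shadow space, always reserved on WIN64
    ; + (10-8)*16                  = 32   spill area for xmm8/xmm9
    ;                                     (xmm6/xmm7 use the shadow space instead)
    ; + 16 - 8 - (stack_offset & 15) = 8  padding to restore 16-byte alignment
    ; = stack_size_padded          = 136, subtracted from rsp with a single SUB

Note that xmm6 and xmm7 no longer contribute to the allocation at all; only
registers past xmm7 cost stack space.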
@@ -433,40 +427,55 @@ DECLARE_REG 14, R15, 120
 %endmacro
 
 %macro WIN64_PUSH_XMM 0
-    %assign %%i xmm_regs_used
-    %rep (xmm_regs_used-6)
-        %assign %%i %%i-1
-        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-    %endrep
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro
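
Conceptually, with xmm_regs_used == 9 the macro now expands to:

    movaps [rstk + stack_offset +  8], xmm6 ; first shadow-space slot
    movaps [rstk + stack_offset + 24], xmm7 ; second shadow-space slot
    movaps [rsp + stack_size + 32],    xmm8 ; allocated spill area above the shadow space

x86inc's tracked PUSH/SUB wrappers keep stack_offset in sync so that
rstk + stack_offset always points back at the return address; +8 is
therefore the start of the shadow space, and both slots are 16-byte aligned
because rsp was 8 mod 16 at entry, which is why movaps is safe here.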
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        WIN64_PUSH_XMM
+    %if xmm_regs_used > 8
+        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro
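
The (~stack_offset & 8) term is alignment arithmetic: rsp is 8 mod 16 at
function entry, and in the standalone WIN64_SPILL_XMM case stack_offset
comes from 8-byte pushes, so this adds exactly the 0 or 8 bytes needed for
the SUB to leave rsp 16-byte aligned. Worked through:

    ; stack_offset = 0  (no pushes):  rsp mod 16 == 8, ~0  & 8 = 8 -> pad 8
    ; stack_offset = 8  (one push):   rsp mod 16 == 0, ~8  & 8 = 0 -> pad 0
    ; stack_offset = 16 (two pushes): rsp mod 16 == 8, ~16 & 8 = 8 -> pad 8

In every case rsp ends up a multiple of 16 after the SUB, so the movaps
stores in WIN64_PUSH_XMM stay aligned.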
 
 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        %if stack_size_padded == 0
-            add %1, (xmm_regs_used-6)*16+16
-        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
         %endif
     %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %endif
 %endmacro
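
The %%pad_size bookkeeping exists because the epilogue's plain
`add %1, stack_size_padded` is not reflected in stack_offset (unlike the
tracked SUB in the prologue; x86inc's invariant is that rsp + stack_offset
points at the return address). Subtracting %%pad_size back out keeps the
shadow-space addressing correct in both epilogue variants:

    ; aligned stack:   add rsp, stack_size_padded  -> %%pad_size = stack_size_padded
    ;                  xmm7 = [rsp + stack_offset - stack_size_padded + 24]
    ; unaligned stack: mov rsp, rstkm              -> %%pad_size = 0
    ;                  xmm7 = [rsp + stack_offset + 24]

Either way the load resolves to return address + 24, i.e. the second
shadow-space slot. xmm6/xmm7 are deliberately restored after the frame is
freed: the shadow space is caller-owned memory above the return address, so
it stays valid until the function returns.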
 
 %macro WIN64_RESTORE_XMM 1
@@ -683,12 +692,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %xdefine rstk rsp
-    %assign stack_offset 0
-    %assign stack_size 0
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
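
With all of the above in place, a hypothetical Win64 function can clobber
xmm6/xmm7 with no per-function stack management at all (illustrative, not
from the patch):

    INIT_XMM sse2
    cglobal example, 3,3,8 ; 3 args, 3 GPRs, 8 xmm regs: xmm6/xmm7 go to the
                           ; shadow space, so the prologue has no SUB rsp
        movaps m6, [r0]
        movaps m7, [r1]
        addps  m6, m7
        movaps [r2], m6
        RET                ; epilogue reloads xmm6/xmm7 from the shadow space

Only a function declaring more than 8 xmm registers, or requesting local
stack space, still pays for a stack adjustment.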


