[FFmpeg-devel] [PATCH] x86inc: support stack mem allocation and re-alignment in PROLOGUE.
Ronald S. Bultje
rsbultje at gmail.com
Sat Oct 6 23:42:29 CEST 2012
From: "Ronald S. Bultje" <rsbultje at gmail.com>
Use this in VP8/H264-8bit loopfilter functions so they can be used if
there is no aligned stack (e.g. MSVC 32bit or ICC 10.x).
---
libavcodec/x86/h264_deblock.asm | 27 ++-----
libavcodec/x86/h264dsp_init.c | 4 +-
libavcodec/x86/vp8dsp.asm | 68 ++++++++--------
libavcodec/x86/vp8dsp_init.c | 8 --
libavutil/x86/x86inc.asm | 167 +++++++++++++++++++++++++++++++++-------
5 files changed, 181 insertions(+), 93 deletions(-)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 940a8f7..43aaf6d 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -399,14 +399,12 @@ DEBLOCK_LUMA
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_8, 5,5
+cglobal deblock_%1_luma_8, 5, 6 - HAVE_ALIGNED_STACK, 0, 2 * %2
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
- %assign pad 2*%2+12-(stack_offset&15)
- SUB esp, pad
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
@@ -444,22 +442,19 @@ cglobal deblock_%1_luma_8, 5,5
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
- ADD esp, pad
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma_8, 0,5
+cglobal deblock_h_luma_8, 0,6 - HAVE_ALIGNED_STACK, 0, 0x70
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
- %assign pad 0x78-(stack_offset&15)
- SUB esp, pad
-%define pix_tmp esp+12
+%define pix_tmp esp+16
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -501,7 +496,6 @@ cglobal deblock_h_luma_8, 0,5
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
- ADD esp, pad
RET
%endmacro ; DEBLOCK_LUMA
@@ -632,7 +626,7 @@ DEBLOCK_LUMA v, 16
%define mpb_0 m14
%define mpb_1 m15
%else
- %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+ %define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
@@ -647,10 +641,7 @@ DEBLOCK_LUMA v, 16
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra_8, 4,6,16
-%if ARCH_X86_64 == 0
- sub esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra_8, 4, 6, 16, ARCH_X86_32 * 0x50
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
@@ -699,9 +690,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
-%if ARCH_X86_64 == 0
- add esp, 0x60
-%endif
RET
INIT_MMX cpuname
@@ -738,12 +726,10 @@ cglobal deblock_h_luma_intra_8, 4,9
add rsp, 0x88
RET
%else
-cglobal deblock_h_luma_intra_8, 2,4
+cglobal deblock_h_luma_intra_8, 2, 5 - HAVE_ALIGNED_STACK, 0, 0x80
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
- SUB rsp, pad
%define pix_tmp rsp
; transpose 8x16 -> tmp space
@@ -774,7 +760,6 @@ cglobal deblock_h_luma_intra_8, 2,4
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
- ADD rsp, pad
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 3f6ded4..8596f26 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -275,18 +275,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
-#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
-#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSSE3(mm_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
- if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) {
+ if (EXTERNAL_AVX(mm_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index ab58e95..036b81b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1632,28 +1632,31 @@ SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
%macro INNER_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ; [3]=hev() result
+%define stack_size mmsize * 4
+%else ; h ; extra storage space for transposes
+%define stack_size mmsize * 5
+%endif
+%endif
+
%if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
-cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
pxor m7, m7
%endif
-%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
-%ifidn %1, v ; [3]=hev() result
-%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
-%else ; h ; extra storage space for transposes
-%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
-%endif
+
+%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
- SUB rsp, pad
-
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
@@ -2083,12 +2086,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
dec cntrq
jg .next8px
%endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
- ADD rsp, pad
-%endif
+ REP_RET
+%else ; mmsize == 16
RET
+%endif
%endmacro
%if ARCH_X86_32
@@ -2123,31 +2124,34 @@ INNER_LOOPFILTER h, 8
;-----------------------------------------------------------------------------
%macro MBEDGE_LOOPFILTER 2
-%if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
-%else ; luma
-cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
-%endif
-
-%if cpuflag(ssse3)
- pxor m7, m7
-%endif
+%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ; [3]=hev() result
; [4]=filter tmp result
; [5]/[6] = p2/q2 backup
; [7]=lim_res sign result
-%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * 7
%else ; 8 ; extra storage space for transposes
-%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * 8
+%endif
%endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+ pxor m7, m7
+%endif
+
+%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
- SUB rsp, pad
-
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
@@ -2741,12 +2745,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
dec cntrq
jg .next8px
%endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
- ADD rsp, pad
-%endif
+ REP_RET
+%else ; mmsize == 16
RET
+%endif
%endmacro
%if ARCH_X86_32
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index 38ad0c7..c9f8d32 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
-#endif
}
if (mm_flags & AV_CPU_FLAG_SSE2) {
@@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
-#endif
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
@@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
@@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
-#endif
}
if (mm_flags & AV_CPU_FLAG_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
-#endif
}
#endif /* HAVE_YASM */
}
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index d734c6e..1c81527 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -109,7 +109,15 @@ CPUNOP amdnop
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = list of names to define to registers
+; %4 = (optional) an amount of aligned memory to be reserved on the stack. If
+; not already aligned, the stack will be aligned to 16 (MMX/XMM) or 32 (YMM)
+; bytes, and at least that number of bytes will be reserved on the stack.
+; The stack pointer will be automatically reset in RET.
+; If the stack was manually aligned (HAVE_ALIGNED_STACK == 0 || YMM), the
+; original stack pointer will be placed in the last register available to
+; this function (for PROLOGUE 1, 3: r2) and in [rsp+stack_size_padded],
+; so that you can still use memory arguments (i.e. rNm/rNmp).
+; %4/%5 = list of names to define to registers.
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
@@ -145,10 +153,10 @@ CPUNOP amdnop
%define r%1m %2d
%define r%1mp %2
%elif ARCH_X86_64 ; memory
- %define r%1m [rsp + stack_offset + %3]
+ %define r%1m [rSTK + stack_offset + %3]
%define r%1mp qword r %+ %1 %+ m
%else
- %define r%1m [esp + stack_offset + %3]
+ %define r%1m [rSTK + stack_offset + %3]
%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
@@ -210,12 +218,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro PUSH 1
push %1
- %assign stack_offset stack_offset+gprsize
+ %ifidn rSTK, rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
%endmacro
%macro POP 1
pop %1
- %assign stack_offset stack_offset-gprsize
+ %ifidn rSTK, rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
%endmacro
%macro PUSH_IF_USED 1-*
@@ -248,14 +260,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro SUB 2
sub %1, %2
%ifidn %1, rsp
- %assign stack_offset stack_offset+(%2)
+ %ifidn rSTK, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
%endif
%endmacro
%macro ADD 2
add %1, %2
%ifidn %1, rsp
- %assign stack_offset stack_offset-(%2)
+ %ifidn rSTK, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
%endif
%endmacro
@@ -312,6 +328,43 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0
%endmacro
+%macro ALLOC_STACK 1-2 ; stack_size, n_xmm_regs (for win64 only)
+ ASSERT %1 > 0
+ %assign stack_size_alignment ((mmsize + 8) & ~8)
+ %assign stack_size_aligned (%1 + stack_size_alignment - 1) & ~(stack_size_alignment - 1)
+ %if %0 == 2
+ %assign xmm_regs_used %2
+ %else
+ %assign xmm_regs_used 0
+ %endif
+ %if mmsize <= 16 && HAVE_ALIGNED_STACK
+ %assign stack_size_padded stack_size_aligned + stack_size_alignment - gprsize - (stack_offset & (stack_size_alignment - 1))
+ %if xmm_regs_used > 6
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+ %endif
+ SUB rsp, stack_size_padded
+ %else
+ %assign reg_num (regs_used - 1)
+ %xdefine rSTK r %+ reg_num
+ ; align stack, and save original stack location directly above it, i.e.
+ ; in [rsp+stack_size_padded], so we can restore the stack in a single
+ ; instruction (i.e. mov rsp, [rsp+stack_size_padded])
+ mov rSTK, rsp
+ %assign stack_size_padded stack_size_aligned
+ %if xmm_regs_used > 6
+ %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+ %endif
+ sub rsp, gprsize+stack_size_padded
+ and rsp, ~(stack_size_alignment-1)
+ ; TODO(rbultje) if this function is not using rSTK for anything else,
+ ; there is no need to store it in the stack here
+ mov [rsp+stack_size_padded], rSTK
+ %endif
+ %if xmm_regs_used > 6
+ WIN64_PUSH_XMM
+ %endif
+%endmacro
+
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx
@@ -330,31 +383,46 @@ DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
- %if mmsize == 8
- %assign xmm_regs_used 0
- %else
+ %assign xmm_regs_used 0
+ %ifnum %4
+ %if %4 > 0
+ ALLOC_STACK %4, %3
+ %endif
+ %endif
+ %if mmsize != 8 && stack_size_aligned == 0
WIN64_SPILL_XMM %3
%endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %elif %0 == 4
+ DEFINE_ARGS %4
+ %elif %0 > 4
+ DEFINE_ARGS %4, %5
+ %endif
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16 + stack_size_aligned], xmm %+ %%i
+ %endrep
%endmacro
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
- SUB rsp, (xmm_regs_used-6)*16+16
- %assign %%i xmm_regs_used
- %rep (xmm_regs_used-6)
- %assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
- %endrep
+ %assign stack_size_padded (xmm_regs_used-6)*16+16-gprsize-(stack_offset&15)
+ SUB rsp, stack_size_padded
+ WIN64_PUSH_XMM
%endif
%endmacro
@@ -363,19 +431,23 @@ DECLARE_REG 14, R15, 120
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size_aligned]
%endrep
- add %1, (xmm_regs_used-6)*16+16
+ %endif
+ %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+ mov rsp, [rsp+stack_size_padded]
+ %else
+ add %1, stack_size_padded
%endif
%endmacro
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign stack_offset (stack_offset-stack_size_padded)
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size_aligned > 0
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
@@ -404,19 +476,37 @@ DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ %ifnum %4
+ %if %4 > 0
+ ALLOC_STACK %4
+ %endif
+ %endif
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
- DEFINE_ARGS %4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %elif %0 == 4
+ DEFINE_ARGS %4
+ %elif %0 > 4
+ DEFINE_ARGS %4, %5
+ %endif
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size_aligned > 0
%macro RET 0
+%if stack_size_aligned > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+ mov rsp, [rsp+stack_size_padded]
+%else
+ add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
vzeroupper
@@ -445,7 +535,7 @@ DECLARE_REG 6, ebp, 28
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, arg_names...
%assign num_args %1
%assign regs_used %2
%if num_args > 7
@@ -456,13 +546,31 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
ASSERT regs_used >= num_args
PUSH_IF_USED 3, 4, 5, 6
+ %ifnum %4
+ %if %4 > 0
+ ALLOC_STACK %4
+ %endif
+ %endif
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
- DEFINE_ARGS %4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %elif %0 == 4
+ DEFINE_ARGS %4
+ %elif %0 > 4
+ DEFINE_ARGS %4, %5
+ %endif
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size_aligned > 0
%macro RET 0
+%if stack_size_aligned > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+ mov rsp, [rsp+stack_size_padded]
+%else
+ add rsp, stack_size_padded
+%endif
+%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
vzeroupper
@@ -524,7 +632,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
align function_align
%1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %xdefine rSTK rsp
%assign stack_offset 0
+ %assign stack_size_aligned 0
+ %assign stack_size_padded 0
%ifnidn %2, ""
PROLOGUE %2
%endif
--
1.7.11.3
More information about the ffmpeg-devel
mailing list