[FFmpeg-cvslog] r24044 - in trunk/libavcodec/ppc: fft_altivec.c fft_altivec_s.S
mru
subversion
Sun Jul 4 20:33:47 CEST 2010
Author: mru
Date: Sun Jul 4 20:33:47 2010
New Revision: 24044
Log:
PPC: convert Altivec FFT to pure assembler
On PPC a leaf function has a 288-byte red zone below the stack pointer,
sparing these functions the chore of setting up a full stack frame.
When a function call is hidden inside an inline asm block, the
compiler may fail to adjust the stack pointer before the call,
so the red zone can be clobbered by the callee.
Moving the entire function to pure asm avoids this problem and also
results in somewhat better code.
Modified:
trunk/libavcodec/ppc/fft_altivec.c
trunk/libavcodec/ppc/fft_altivec_s.S
Modified: trunk/libavcodec/ppc/fft_altivec.c
==============================================================================
--- trunk/libavcodec/ppc/fft_altivec.c Sun Jul 4 20:33:43 2010 (r24043)
+++ trunk/libavcodec/ppc/fft_altivec.c Sun Jul 4 20:33:47 2010 (r24044)
@@ -22,7 +22,6 @@
#include "libavcodec/fft.h"
#include "util_altivec.h"
#include "types_altivec.h"
-#include "regs.h"
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
@@ -33,51 +32,10 @@
* It also assumes all FFTComplex are 8 bytes-aligned pair of float
*/
-// Pointers to functions. Not using function pointer syntax, because
-// that involves an extra level of indirection on some PPC ABIs.
-extern void *ff_fft_dispatch_altivec[2][15];
+void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
#if HAVE_GNU_AS
-static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
-{
- register vec_f v14 __asm__("v14") = {0,0,0,0};
- register vec_f v15 __asm__("v15") = *(const vec_f*)ff_cos_16;
- register vec_f v16 __asm__("v16") = {0, 0.38268343, M_SQRT1_2, 0.92387953};
- register vec_f v17 __asm__("v17") = {-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2};
- register vec_f v18 __asm__("v18") = { M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2};
- register vec_u8 v19 __asm__("v19") = vcprm(s0,3,2,1);
- register vec_u8 v20 __asm__("v20") = vcprm(0,1,s2,s1);
- register vec_u8 v21 __asm__("v21") = vcprm(2,3,s0,s3);
- register vec_u8 v22 __asm__("v22") = vcprm(2,s3,3,s2);
- register vec_u8 v23 __asm__("v23") = vcprm(0,1,s0,s1);
- register vec_u8 v24 __asm__("v24") = vcprm(2,3,s2,s3);
- register vec_u8 v25 __asm__("v25") = vcprm(2,3,0,1);
- register vec_u8 v26 __asm__("v26") = vcprm(1,2,s3,s0);
- register vec_u8 v27 __asm__("v27") = vcprm(0,3,s2,s1);
- register vec_u8 v28 __asm__("v28") = vcprm(0,2,s1,s3);
- register vec_u8 v29 __asm__("v29") = vcprm(1,3,s0,s2);
- register FFTSample *const*cos_tabs __asm__("r12") = ff_cos_tabs;
- register FFTComplex *zarg __asm__("r3") = z;
- __asm__(
- "mtctr %0 \n"
- "li "r(9)", 16 \n"
- "subi "r(1)","r(1) ",%1 \n"
- "bctrl \n"
- "addi "r(1)","r(1) ",%1 \n"
- ::"r"(ff_fft_dispatch_altivec[do_swizzle][s->nbits-2]), "i"(12*sizeof(void*)),
- "r"(zarg), "r"(cos_tabs),
- "v"(v14),"v"(v15),"v"(v16),"v"(v17),"v"(v18),"v"(v19),"v"(v20),"v"(v21),
- "v"(v22),"v"(v23),"v"(v24),"v"(v25),"v"(v26),"v"(v27),"v"(v28),"v"(v29)
- : "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
- "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
- );
-}
-
-static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
-{
- fft_dispatch(s, z, 1);
-}
-
static void ff_imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int j, k;
@@ -132,7 +90,7 @@ static void ff_imdct_half_altivec(FFTCon
k--;
} while(k >= 0);
- fft_dispatch(s, (FFTComplex*)output, 0);
+ ff_fft_calc_altivec(s, (FFTComplex*)output);
/* post rotation + reordering */
j = -n32;
@@ -182,7 +140,7 @@ static void ff_imdct_calc_altivec(FFTCon
av_cold void ff_fft_init_altivec(FFTContext *s)
{
#if HAVE_GNU_AS
- s->fft_calc = ff_fft_calc_altivec;
+ s->fft_calc = ff_fft_calc_interleave_altivec;
s->imdct_calc = ff_imdct_calc_altivec;
s->imdct_half = ff_imdct_half_altivec;
#endif
Modified: trunk/libavcodec/ppc/fft_altivec_s.S
==============================================================================
--- trunk/libavcodec/ppc/fft_altivec_s.S Sun Jul 4 20:33:43 2010 (r24043)
+++ trunk/libavcodec/ppc/fft_altivec_s.S Sun Jul 4 20:33:47 2010 (r24044)
@@ -49,24 +49,6 @@
.endif
.endm
-#if ARCH_PPC64
-#define PTR .quad
-.macro LOAD_PTR ra, rbase, offset
- ld \ra,(\offset)*8(\rbase)
-.endm
-.macro STORE_PTR ra, rbase, offset
- std \ra,(\offset)*8(\rbase)
-.endm
-#else
-#define PTR .int
-.macro LOAD_PTR ra, rbase, offset
- lwz \ra,(\offset)*4(\rbase)
-.endm
-.macro STORE_PTR ra, rbase, offset
- stw \ra,(\offset)*4(\rbase)
-.endm
-#endif
-
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
@@ -314,18 +296,105 @@ fft_pass\suffix\()_altivec:
blr
.endm
+#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
+
+ .rodata
+ .align 4
+fft_data:
+ .float 0, 0, 0, 0
+ .float 1, 0.92387953, M_SQRT1_2, 0.38268343
+ .float 0, 0.38268343, M_SQRT1_2, 0.92387953
+ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
+ .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+ vcprm(s0,3,2,1)
+ vcprm(0,1,s2,s1)
+ vcprm(2,3,s0,s3)
+ vcprm(2,s3,3,s2)
+ vcprm(0,1,s0,s1)
+ vcprm(2,3,s2,s3)
+ vcprm(2,3,0,1)
+ vcprm(1,2,s3,s0)
+ vcprm(0,3,s2,s1)
+ vcprm(0,2,s1,s3)
+ vcprm(1,3,s0,s2)
+
+.macro lvm b, r, regs:vararg
+ lvx \r, 0, \b
+ addi \b, \b, 16
+ .ifnb \regs
+ lvm \b, \regs
+ .endif
+.endm
+
+.macro stvm b, r, regs:vararg
+ stvx \r, 0, \b
+ addi \b, \b, 16
+ .ifnb \regs
+ stvm \b, \regs
+ .endif
+.endm
+
+.macro fft_calc interleave
+extfunc ff_fft_calc\interleave\()_altivec
+ mflr r0
+ stp r0, 2*PS(r1)
+ stpu r1, -(160+16*PS)(r1)
+ addi r6, r1, 16*PS
+ stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ mfvrsave r0
+ stw r0, 15*PS(r1)
+ li r6, 0xfffffffc
+ mtvrsave r6
+
+ movrel r6, fft_data
+ lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
+ lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
+
+ li r9, 16
+ movrel r12, X(ff_cos_tabs)
+
+ movrel r6, fft_dispatch_tab\interleave\()_altivec
+ lwz r3, 0(r3)
+ subi r3, r3, 2
+ slwi r3, r3, 2+ARCH_PPC64
+ lpx r3, r3, r6
+ mtctr r3
+ mr r3, r4
+ bctrl
+
+ addi r6, r1, 16*PS
+ lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ lwz r6, 15*PS(r1)
+ mtvrsave r6
+ lp r1, 0(r1)
+ lp r0, 2*PS(r1)
+ mtlr r0
+ blr
+.endm
+
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
mflr r0
- STORE_PTR r0,r1,\bits-5
+ stp r0,PS*(\bits-3)(r1)
bl fft\n2\()_altivec
addi2 r3,\n*4
bl fft\n4\()_altivec
addi2 r3,\n*2
bl fft\n4\()_altivec
addi2 r3,\n*-6
- LOAD_PTR r0,r1,\bits-5
- LOAD_PTR r4,r12,\bits
+ lp r0,PS*(\bits-3)(r1)
+ lp r4,\bits*PS(r12)
mtlr r0
li r5,\n/16
b fft_pass\suffix\()_altivec
@@ -350,9 +419,11 @@ fft\n\suffix\()_altivec:
DECL_FFT \suffix,15,32768,16384, 8192
DECL_FFT \suffix,16,65536,32768,16384
+ fft_calc \suffix
+
.rodata
- .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
-EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
+ .align 3
+fft_dispatch_tab\suffix\()_altivec:
PTR fft4\suffix\()_altivec
PTR fft8\suffix\()_altivec
PTR fft16\suffix\()_altivec
More information about the ffmpeg-cvslog
mailing list