[FFmpeg-devel] [PATCH 2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

Fri Jul 11 01:14:29 CEST 2014

The previous implementation targeted DTS Coherent Acoustics, which only
requires nbits == 4 (fft16()). This case was (and still is) linked directly
rather than being indirected through ff_fft_calc_vfp(), but now the full
range from 4-point up to 65536-point FFTs is available. This benefits other codecs
such as AAC and AC3.

The implementation is based upon the C version, with each routine larger than
radix-16 calling a hierarchy of smaller FFT functions, then performing a
post-processing pass. This pass benefits a lot from loop unrolling to
counter the long pipelines in the VFP. A relaxed calling standard also
reduces the overhead of the call hierarchy, and avoiding the excessive
inlining performed by GCC probably helps with I-cache utilisation too.

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in the FFT routines (fft4() to fft512() and pass()) for the
same sample AAC stream:

              Before          After
              Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode  2245.5 53.1     1599.6 43.8    100.0%      +40.4%
FFT routines  940.6  22.0     348.1  20.8    100.0%      +170.2%
---
 libavcodec/arm/fft_init_arm.c |    8 +-
 libavcodec/arm/fft_vfp.S      |  261 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 252 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 7e49b9c..5087f5f 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@
 #include "libavcodec/rdft.h"
 #include "libavcodec/synth_filter.h"
 
+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
@@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_vfp(cpu_flags)) {
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+        s->fft_calc     = ff_fft_calc_vfp;
 #if CONFIG_MDCT
-        if (!have_vfpv3(cpu_flags))
-            s->imdct_half   = ff_imdct_half_vfp;
+        s->imdct_half   = ff_imdct_half_vfp;
 #endif
     }
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index f1ab37c..65498a5 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,9 @@
 
 #include "libavutil/arm/asm.S"
 
-@ TODO: * FFTs wider than 16
-@       * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit.
 
 function fft4_vfp
         vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
@@ -131,18 +132,22 @@ endfunc
              vstr    d9, [a1, #3 * 2*4]
 .endm
 
+function fft8_internal_vfp
+        macro_fft8_head
+        macro_fft8_tail
+        bx      lr                  @ plain return; the fft8_vfp wrapper sets FPSCR and saves s16-s31 around this call
+endfunc
+
 function fft8_vfp
         ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
         fmrx    a2, FPSCR
         fmxr    FPSCR, a3
         vpush   {s16-s31}
-
-        macro_fft8_head
-        macro_fft8_tail
-
+        mov     ip, lr              @ the bl below overwrites lr, so keep the return address in ip
+        bl      fft8_internal_vfp   @ a2 (saved FPSCR) and ip are still needed after this call
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip                  @ return through the address saved before the bl
 endfunc
 
 .align 3
@@ -153,12 +158,7 @@ cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
 cos3pi8:    @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
         .float  0.3826834261417388916015625
 
-function ff_fft16_vfp, export=1
-        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
-        fmrx    a2, FPSCR
-        fmxr    FPSCR, a3
-        vpush   {s16-s31}
-
+function fft16_internal_vfp
         macro_fft8_head
         @ FFT4(z+8)
         vldr    d10, [a1, #8 * 2*4]
@@ -292,7 +292,240 @@ function ff_fft16_vfp, export=1
               vstr    d8, [a1, #0 * 2*4]
               vstr    d9, [a1, #4 * 2*4]
 
+        bx      lr
+endfunc
+
+function ff_fft16_vfp, export=1
+        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR           @ a2 = caller's FPSCR, restored before returning
+        fmxr    FPSCR, a3           @ switch to the mode the internal routines expect
+        vpush   {s16-s31}           @ the internal routines may corrupt every single-precision register
+        mov     ip, lr              @ the bl below overwrites lr, so keep the return address in ip
+        bl      fft16_internal_vfp  @ a2 (saved FPSCR) and ip are still needed after this call
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip                  @ return through the address saved before the bl
+endfunc
+
+.macro pass n, z0, z1, z2, z3
+        add     v6, v5, #4*2*\n     @ v6 = v5 + 8*n bytes; v6 is read downwards (vldmdb), v5 upwards (vldmia)
+        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
+            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
+                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+            vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            vldmdb  v6!, {s2}
+            vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vldmia  v5!, {s0,s1}                @ s0 is unused
+        vldr    s7, [\z2, #8*o2]            @ t1
+            vmul.f  s20, s16, s2                @ vector * scalar
+        vldr    s0, [\z3, #8*o3]            @ t5
+        vldr    s6, [\z2, #8*o2+4]          @ t2
+        vldr    s3, [\z3, #8*o3+4]          @ t6
+            vmul.f  s16, s16, s1                @ vector * scalar
+        ldr     a4, =\n-1           @ a4 = n-1 loop iterations; the final iteration is peeled off after the loop
+1:      add     \z0, \z0, #8*2      @ step 16 bytes up front; the loop body compensates with -8*2 offsets
+ .if \n*4*2 >= 512
+        add     \z1, \z1, #8*2      @ only for sizes where z1 is a register distinct from z0
+ .endif
+ .if \n*4*2 >= 256
+        add     \z2, \z2, #8*2      @ only for sizes where z2 is a register distinct from z0
+ .endif
+ .if \n*4*2 >= 512
+        add     \z3, \z3, #8*2      @ only for sizes where z3 is a register distinct from z2
+ .endif
+        @ up to 2 stalls (VFP vector issuing / waiting for s0)
+        @ depending upon whether this is the first iteration and
+        @ how many add instructions are inserted above
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
+        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
+        @ stall (waiting for s5)
+        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
+                vldr    d4, [\z2, #8*o2]            @ s8,s9
+                vldmdb  v6!, {s2,s3}
+                vldr    d5, [\z3, #8*o3]            @ s10,s11
+            vadd.f  s20, s4, s12                @ vector + vector
+                vldmia  v5!, {s0,s1}
+                    vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            @ stall (VFP vector issuing)
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+                vmul.f  s12, s8, s3                 @ vector * scalar
+            vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
+                    vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
+                vmul.f  s8, s8, s0                  @ vector * scalar
+            vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
+            @ stall (waiting for s7)
+            vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
+                    vmul.f  s20, s16, s2                @ vector * scalar
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                vadd.f  s7, s8, s13                 @ t1
+                vsub.f  s6, s9, s12                 @ t2
+                vsub.f  s0, s10, s15                @ t5
+                vadd.f  s3, s11, s14                @ t6
+                    vmul.f  s16, s16, s1                @ vector * scalar
+        subs    a4, a4, #1          @ one fewer iteration left; sets Z when the count reaches zero
+        bne     1b                  @ repeat the software-pipelined body while a4 != 0
+        @ What remains is identical to the first two indentations of
+        @ the above, but without the increment of z
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0]             @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1]            @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0]             @ s8,s9
+        vstr    d5, [\z1, #8*o1]            @ s10,s11
+        vstr    d2, [\z2, #8*o2]            @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3]            @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1]             @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
+            vadd.f  s20, s4, s12                @ vector + vector
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+            vstr    d10, [\z0, #8*1]            @ s20,s21
+            vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
+            vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
+            vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
+.endm
+
+.macro fft_internal_vfp name, name2, name4, costable, n
+function \name
+ .if \n >= 512
+        push    {v1-v6,lr}          @ v1-v4 become the four z pointers, v5-v6 the table pointers used by pass
+ .elseif \n >= 256
+        push    {v1-v2,v5-v6,lr}    @ only two z pointers (v1, v2) are needed at this size
+ .else
+        push    {v1,v5-v6,lr}       @ a single z pointer (v1) is enough at this size
+ .endif
+        mov     v1, a1              @ v1 = z, kept across the three sub-transform calls
+        bl      \name2              @ half-size transform starting at z
+        add     a1, v1, #8*(\n/4)*2 @ a1 = z + n/2 elements (8 bytes each)
+        bl      \name4              @ quarter-size transform on the third quarter
+        ldr     v5, =\costable      @ v5 = table base; pass reads it upwards via v5 and downwards via v6
+        add     a1, v1, #8*(\n/4)*3 @ a1 = z + 3n/4 elements
+        bl      \name4              @ quarter-size transform on the last quarter
+ .if \n >= 512
+  .set o1, 0*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 0*(\n/4/2)
+        add     v2, v1, #8*2*(\n/4/2)   @ offsets o1-o3 are zero here: each quarter gets its own base register,
+        add     v3, v1, #8*4*(\n/4/2)   @ presumably to keep the vldr/vstr immediates within their
+        add     v4, v1, #8*6*(\n/4/2)   @ limited offset range - TODO confirm
+        pass    (\n/4/2), v1, v2, v3, v4
+        pop     {v1-v6,pc}          @ restore and return in one instruction
+ .elseif \n >= 256
+  .set o1, 2*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 2*(\n/4/2)
+        add     v2, v1, #8*4*(\n/4/2)   @ v2 = z + n/2 elements; serves as base for both z2 and z3
+        pass    (\n/4/2), v1, v1, v2, v2
+        pop     {v1-v2,v5-v6,pc}    @ restore and return in one instruction
+ .else
+  .set o1, 2*(\n/4/2)
+  .set o2, 4*(\n/4/2)
+  .set o3, 6*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v1, v1
+        pop     {v1,v5-v6,pc}       @ restore and return in one instruction
+ .endif
+endfunc
+.endm
+
+#define DECL_FFT(n,n2,n4)               \
+fft_internal_vfp  fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\
+                                       ;\
+function fft##n##_vfp                  ;\
+        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\
+        fmrx    a2, FPSCR /* a2 = caller's FPSCR, restored below */ ;\
+        fmxr    FPSCR, a3              ;\
+        vpush   {s16-s31} /* internal routines may corrupt every single-precision register */ ;\
+        mov     ip, lr /* bl overwrites lr; keep the return address in ip */ ;\
+        bl      fft##n##_internal_vfp  ;\
+        vpop    {s16-s31}              ;\
+        fmxr    FPSCR, a2              ;\
+        bx      ip /* return through the address saved before the bl */ ;\
+endfunc                                ;\
+                                       ;\
+.ltorg /* emit the literal pool for the ldr =value loads above */
+
+DECL_FFT(32,16,8)
+DECL_FFT(64,32,16)
+DECL_FFT(128,64,32)
+DECL_FFT(256,128,64)
+DECL_FFT(512,256,128)
+DECL_FFT(1024,512,256)
+DECL_FFT(2048,1024,512)
+DECL_FFT(4096,2048,1024)
+DECL_FFT(8192,4096,2048)
+DECL_FFT(16384,8192,4096)
+DECL_FFT(32768,16384,8192)
+DECL_FFT(65536,32768,16384)
+
+function ff_fft_calc_vfp, export=1
+        ldr     ip, [a1, #0]    @ nbits
+        mov     a1, a2          @ a1 = z, the only argument the fftN_vfp routines read
+        ldr     ip, [pc, ip, lsl #2]    @ ip = table[nbits]; assumes ARM state, where pc reads as this insn + 8 = first .word
+        bx      ip              @ tail call: lr is untouched, so the routine returns to our caller
+        .word   0               @ nbits == 0: no routine; presumably never requested - TODO confirm
+        .word   0               @ nbits == 1: no routine; presumably never requested - TODO confirm
+        .word   fft4_vfp        @ nbits == 2
+        .word   fft8_vfp        @ nbits == 3
+        .word   ff_fft16_vfp    @ this one alone is exported
+        .word   fft32_vfp
+        .word   fft64_vfp
+        .word   fft128_vfp
+        .word   fft256_vfp
+        .word   fft512_vfp
+        .word   fft1024_vfp
+        .word   fft2048_vfp
+        .word   fft4096_vfp
+        .word   fft8192_vfp
+        .word   fft16384_vfp
+        .word   fft32768_vfp
+        .word   fft65536_vfp    @ nbits == 16, the last table entry
 endfunc
-- 
1.7.5.4