[FFmpeg-cvslog] dcadsp: split lfe_fir cases

Sat Feb 8 02:54:13 CET 2014

ffmpeg | branch: master | Christophe Gisquet <christophe.gisquet at gmail.com> | Wed Feb  5 23:40:52 2014 +0000| [5fdbfcb5b793f5849c496214668094a8ec99fa07] | committer: Janne Grunau

dcadsp: split lfe_fir cases

The x86 code runs short on registers because numerous elements are not static.
In addition, splitting the cases allows more optimized code, at least for x86.

ARM asm changes by Janne Grunau.

Signed-off-by: Janne Grunau <janne-libav at jannau.net>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5fdbfcb5b793f5849c496214668094a8ec99fa07
---

 libavcodec/arm/dcadsp_init_arm.c |   23 ++++++++++++++++-------
 libavcodec/arm/dcadsp_neon.S     |   18 ++++++++++++------
 libavcodec/arm/dcadsp_vfp.S      |   32 ++++++++++++--------------------
 libavcodec/dcadec.c              |   10 +++++-----
 libavcodec/dcadsp.c              |   20 +++++++++++++++++---
 libavcodec/dcadsp.h              |    4 ++--
 6 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index d49a176..2ea1289 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,16 +24,22 @@
 #include "libavutil/attributes.h"
 #include "libavcodec/dcadsp.h"
 
-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
-                        int decifactor, float scale);
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs,
+                          float scale);
+
+void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs,
+                          float scale);
+
 void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                 SynthFilterContext *synth, FFTContext *imdct,
                                 float synth_buf_ptr[512],
                                 int *synth_buf_offset, float synth_buf2[32],
                                 const float window[512], float *samples_out,
                                 float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
-                         int decifactor, float scale);
 
 void ff_synth_filter_float_vfp(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
@@ -52,11 +58,14 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
-        s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->lfe_fir[0]      = ff_dca_lfe_fir32_vfp;
+        s->lfe_fir[1]      = ff_dca_lfe_fir64_vfp;
         s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
     }
-    if (have_neon(cpu_flags))
-        s->lfe_fir = ff_dca_lfe_fir_neon;
+    if (have_neon(cpu_flags)) {
+        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+    }
 }
 
 av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index fe3aae8..c798fea 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -20,17 +20,23 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_dca_lfe_fir_neon, export=1
+function ff_dca_lfe_fir0_neon, export=1
         push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #32                @ decifactor
+        mov             r6,  #256/32
+        b               dca_lfe_fir
+endfunc
 
+function ff_dca_lfe_fir1_neon, export=1
+        push            {r4-r6,lr}
+NOVFP   vmov            s0,  r3                 @ scale
+        mov             r3,  #64                @ decifactor
+        mov             r6,  #256/64
+dca_lfe_fir:
         add             r4,  r0,  r3,  lsl #2   @ out2
         add             r5,  r2,  #256*4-16     @ cf1
         sub             r1,  r1,  #12
-        cmp             r3,  #32
-        ite             eq
-        moveq           r6,  #256/32
-        movne           r6,  #256/64
-NOVFP   vldr            s0,  [sp, #16]          @ scale
         mov             lr,  #-16
 1:
         vmov.f32        q2,  #0.0               @ v0
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index 5892a84..edabc29 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -24,7 +24,6 @@
 POUT          .req    a1
 PIN           .req    a2
 PCOEF         .req    a3
-DECIFACTOR    .req    a4
 OLDFPSCR      .req    a4
 COUNTER       .req    ip
 
@@ -129,6 +128,15 @@ POST3         .req    s27
 .endm
 
 .macro dca_lfe_fir  decifactor
+function ff_dca_lfe_fir\decifactor\()_vfp, export=1
+NOVFP   vmov    s0, r3
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
  .if \decifactor == 32
   .set JMAX, 8
         vpush   {s16-s31}
@@ -165,32 +173,16 @@ POST3         .req    s27
  .endif
         fmxr    FPSCR, OLDFPSCR
         bx      lr
+endfunc
 .endm
 
-
-/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- *                         int decifactor, float scale)
- */
-function ff_dca_lfe_fir_vfp, export=1
-        teq     DECIFACTOR, #32
-        fmrx    OLDFPSCR, FPSCR
-        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
-        fmxr    FPSCR, ip
-NOVFP   vldr    s0, [sp]
-        vldr    IN0, [PIN, #-0*4]
-        vldr    IN1, [PIN, #-1*4]
-        vldr    IN2, [PIN, #-2*4]
-        vldr    IN3, [PIN, #-3*4]
-        beq     32f
-64:     dca_lfe_fir  64
+        dca_lfe_fir  64
  .ltorg
-32:     dca_lfe_fir  32
-endfunc
+        dca_lfe_fir  32
 
         .unreq  POUT
         .unreq  PIN
         .unreq  PCOEF
-        .unreq  DECIFACTOR
         .unreq  OLDFPSCR
         .unreq  COUNTER
 
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 6ffb040..723ed19 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -957,23 +957,23 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
      * samples_out: An array holding interpolated samples
      */
 
-    int decifactor;
+    int idx;
     const float *prCoeff;
     int deciindex;
 
     /* Select decimation filter */
     if (decimation_select == 1) {
-        decifactor = 64;
+        idx = 1;
         prCoeff = lfe_fir_128;
     } else {
-        decifactor = 32;
+        idx = 0;
         prCoeff = lfe_fir_64;
     }
     /* Interpolation */
     for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
-        s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, scale);
+        s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff, scale);
         samples_in++;
-        samples_out += 2 * decifactor;
+        samples_out += 2 * 32 * (1 + idx);
     }
 }
 
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 148f6dd..8d242c5 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -32,8 +32,9 @@ static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
         dst[i] = src[i] * fscale;
 }
 
-static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
-                          int decifactor, float scale)
+static inline void
+dca_lfe_fir(float *out, const float *in, const float *coefs,
+            int decifactor, float scale)
 {
     float *out2 = out + decifactor;
     const float *cf0 = coefs;
@@ -82,9 +83,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
     }
 }
 
+static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 32, scale);
+}
+
+static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs,
+                           float scale)
+{
+    dca_lfe_fir(out, in, coefs, 64, scale);
+}
+
 av_cold void ff_dcadsp_init(DCADSPContext *s)
 {
-    s->lfe_fir = dca_lfe_fir_c;
+    s->lfe_fir[0] = dca_lfe_fir0_c;
+    s->lfe_fir[1] = dca_lfe_fir1_c;
     s->qmf_32_subbands = dca_qmf_32_subbands;
     s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
     if (ARCH_ARM) ff_dcadsp_init_arm(s);
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index e2ad09a..3e04426 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -23,8 +23,8 @@
 #include "synth_filter.h"
 
 typedef struct DCADSPContext {
-    void (*lfe_fir)(float *out, const float *in, const float *coefs,
-                    int decifactor, float scale);
+    void (*lfe_fir[2])(float *out, const float *in, const float *coefs,
+                       float scale);
     void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
                             SynthFilterContext *synth, FFTContext *imdct,
                             float synth_buf_ptr[512],