[FFmpeg-cvslog] dcadsp: split lfe_fir cases
Christophe Gisquet
git at videolan.org
Sat Feb 8 02:54:13 CET 2014
ffmpeg | branch: master | Christophe Gisquet <christophe.gisquet at gmail.com> | Wed Feb 5 23:40:52 2014 +0000| [5fdbfcb5b793f5849c496214668094a8ec99fa07] | committer: Janne Grunau
dcadsp: split lfe_fir cases
The x86 code runs short on registers because numerous elements are not static.
In addition, splitting the two cases allows more optimized code, at least for x86.
ARM asm changes by Janne Grunau.
Signed-off-by: Janne Grunau <janne-libav at jannau.net>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=5fdbfcb5b793f5849c496214668094a8ec99fa07
---
libavcodec/arm/dcadsp_init_arm.c | 23 ++++++++++++++++-------
libavcodec/arm/dcadsp_neon.S | 18 ++++++++++++------
libavcodec/arm/dcadsp_vfp.S | 32 ++++++++++++--------------------
libavcodec/dcadec.c | 10 +++++-----
libavcodec/dcadsp.c | 20 +++++++++++++++++---
libavcodec/dcadsp.h | 4 ++--
6 files changed, 64 insertions(+), 43 deletions(-)
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index d49a176..2ea1289 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,16 +24,22 @@
#include "libavutil/attributes.h"
#include "libavcodec/dcadsp.h"
-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- int decifactor, float scale);
+void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs,
+ float scale);
+void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs,
+ float scale);
+
+void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs,
+ float scale);
+void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs,
+ float scale);
+
void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
SynthFilterContext *synth, FFTContext *imdct,
float synth_buf_ptr[512],
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
- int decifactor, float scale);
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
@@ -52,11 +58,14 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
- s->lfe_fir = ff_dca_lfe_fir_vfp;
+ s->lfe_fir[0] = ff_dca_lfe_fir32_vfp;
+ s->lfe_fir[1] = ff_dca_lfe_fir64_vfp;
s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
}
- if (have_neon(cpu_flags))
- s->lfe_fir = ff_dca_lfe_fir_neon;
+ if (have_neon(cpu_flags)) {
+ s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
+ s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
+ }
}
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index fe3aae8..c798fea 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -20,17 +20,23 @@
#include "libavutil/arm/asm.S"
-function ff_dca_lfe_fir_neon, export=1
+function ff_dca_lfe_fir0_neon, export=1
push {r4-r6,lr}
+NOVFP vmov s0, r3 @ scale
+ mov r3, #32 @ decifactor
+ mov r6, #256/32
+ b dca_lfe_fir
+endfunc
+function ff_dca_lfe_fir1_neon, export=1
+ push {r4-r6,lr}
+NOVFP vmov s0, r3 @ scale
+ mov r3, #64 @ decifactor
+ mov r6, #256/64
+dca_lfe_fir:
add r4, r0, r3, lsl #2 @ out2
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
- cmp r3, #32
- ite eq
- moveq r6, #256/32
- movne r6, #256/64
-NOVFP vldr s0, [sp, #16] @ scale
mov lr, #-16
1:
vmov.f32 q2, #0.0 @ v0
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index 5892a84..edabc29 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -24,7 +24,6 @@
POUT .req a1
PIN .req a2
PCOEF .req a3
-DECIFACTOR .req a4
OLDFPSCR .req a4
COUNTER .req ip
@@ -129,6 +128,15 @@ POST3 .req s27
.endm
.macro dca_lfe_fir decifactor
+function ff_dca_lfe_fir\decifactor\()_vfp, export=1
+NOVFP vmov s0, r3
+ fmrx OLDFPSCR, FPSCR
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, ip
+ vldr IN0, [PIN, #-0*4]
+ vldr IN1, [PIN, #-1*4]
+ vldr IN2, [PIN, #-2*4]
+ vldr IN3, [PIN, #-3*4]
.if \decifactor == 32
.set JMAX, 8
vpush {s16-s31}
@@ -165,32 +173,16 @@ POST3 .req s27
.endif
fmxr FPSCR, OLDFPSCR
bx lr
+endfunc
.endm
-
-/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- * int decifactor, float scale)
- */
-function ff_dca_lfe_fir_vfp, export=1
- teq DECIFACTOR, #32
- fmrx OLDFPSCR, FPSCR
- ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, ip
-NOVFP vldr s0, [sp]
- vldr IN0, [PIN, #-0*4]
- vldr IN1, [PIN, #-1*4]
- vldr IN2, [PIN, #-2*4]
- vldr IN3, [PIN, #-3*4]
- beq 32f
-64: dca_lfe_fir 64
+ dca_lfe_fir 64
.ltorg
-32: dca_lfe_fir 32
-endfunc
+ dca_lfe_fir 32
.unreq POUT
.unreq PIN
.unreq PCOEF
- .unreq DECIFACTOR
.unreq OLDFPSCR
.unreq COUNTER
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 6ffb040..723ed19 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -957,23 +957,23 @@ static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
* samples_out: An array holding interpolated samples
*/
- int decifactor;
+ int idx;
const float *prCoeff;
int deciindex;
/* Select decimation filter */
if (decimation_select == 1) {
- decifactor = 64;
+ idx = 1;
prCoeff = lfe_fir_128;
} else {
- decifactor = 32;
+ idx = 0;
prCoeff = lfe_fir_64;
}
/* Interpolation */
for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
- s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor, scale);
+ s->dcadsp.lfe_fir[idx](samples_out, samples_in, prCoeff, scale);
samples_in++;
- samples_out += 2 * decifactor;
+ samples_out += 2 * 32 * (1 + idx);
}
}
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 148f6dd..8d242c5 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -32,8 +32,9 @@ static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
dst[i] = src[i] * fscale;
}
-static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
- int decifactor, float scale)
+static inline void
+dca_lfe_fir(float *out, const float *in, const float *coefs,
+ int decifactor, float scale)
{
float *out2 = out + decifactor;
const float *cf0 = coefs;
@@ -82,9 +83,22 @@ static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
}
}
+static void dca_lfe_fir0_c(float *out, const float *in, const float *coefs,
+ float scale)
+{
+ dca_lfe_fir(out, in, coefs, 32, scale);
+}
+
+static void dca_lfe_fir1_c(float *out, const float *in, const float *coefs,
+ float scale)
+{
+ dca_lfe_fir(out, in, coefs, 64, scale);
+}
+
av_cold void ff_dcadsp_init(DCADSPContext *s)
{
- s->lfe_fir = dca_lfe_fir_c;
+ s->lfe_fir[0] = dca_lfe_fir0_c;
+ s->lfe_fir[1] = dca_lfe_fir1_c;
s->qmf_32_subbands = dca_qmf_32_subbands;
s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
if (ARCH_ARM) ff_dcadsp_init_arm(s);
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index e2ad09a..3e04426 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -23,8 +23,8 @@
#include "synth_filter.h"
typedef struct DCADSPContext {
- void (*lfe_fir)(float *out, const float *in, const float *coefs,
- int decifactor, float scale);
+ void (*lfe_fir[2])(float *out, const float *in, const float *coefs,
+ float scale);
void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
SynthFilterContext *synth, FFTContext *imdct,
float synth_buf_ptr[512],
More information about the ffmpeg-cvslog
mailing list