[FFmpeg-devel] [PATCH v2 02/16] avcodec/dca: remove unused assembly
foo86
foobaz86 at gmail.com
Thu Jan 21 19:44:55 CET 2016
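This removes the assembly implementations of DCA DSP functions that are not
going to be reused: the LFE FIR filters (AArch64 NEON, ARM NEON/VFP, x86
SSE/FMA3) and the ARM VFP 32-subband QMF, together with the
ff_dcadsp_init_{aarch64,arm,x86}() functions that installed them. The synth
filter assembly and its init functions are kept.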
---
libavcodec/aarch64/Makefile | 3 +-
libavcodec/aarch64/dcadsp_init.c | 15 +-
libavcodec/aarch64/dcadsp_neon.S | 109 ---------
libavcodec/arm/Makefile | 6 +-
libavcodec/arm/dca.h | 1 -
libavcodec/arm/dcadsp_init_arm.c | 30 +--
libavcodec/arm/dcadsp_neon.S | 64 ------
libavcodec/arm/dcadsp_vfp.S | 476 ---------------------------------------
libavcodec/x86/dcadsp.asm | 99 --------
libavcodec/x86/dcadsp_init.c | 21 +-
10 files changed, 6 insertions(+), 818 deletions(-)
delete mode 100644 libavcodec/aarch64/dcadsp_neon.S
delete mode 100644 libavcodec/arm/dcadsp_neon.S
delete mode 100644 libavcodec/arm/dcadsp_vfp.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 022ed84..998ffa2 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -17,8 +17,7 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
- aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
index 78642a5..c8d3c77 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -23,7 +23,7 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
+#include "libavcodec/synth_filter.h"
#include "libavcodec/fft.h"
#include "asm-offsets.h"
@@ -32,25 +32,12 @@
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
deleted file mode 100644
index 0426dc6..0000000
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
- * Copyright (c) 2015 Janne Grunau <janne-libav at jannau.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
- mov x3, #32 // decifactor
- sub x1, x1, #7*4
- add x4, x0, #2*32*4 - 16 // out2
- mov x7, #-16
-
- ld1 {v0.4s,v1.4s}, [x1]
- // reverse [-num_coeffs + 1, 0]
- ext v3.16b, v0.16b, v0.16b, #8
- ext v2.16b, v1.16b, v1.16b, #8
- rev64 v3.4s, v3.4s
- rev64 v2.4s, v2.4s
-1:
- ld1 {v4.4s,v5.4s}, [x2], #32
- ld1 {v6.4s,v7.4s}, [x2], #32
- subs x3, x3, #4
- fmul v16.4s, v2.4s, v4.4s
- fmul v23.4s, v0.4s, v4.4s
- fmul v17.4s, v2.4s, v6.4s
- fmul v22.4s, v0.4s, v6.4s
-
- fmla v16.4s, v3.4s, v5.4s
- fmla v23.4s, v1.4s, v5.4s
- ld1 {v4.4s,v5.4s}, [x2], #32
- fmla v17.4s, v3.4s, v7.4s
- fmla v22.4s, v1.4s, v7.4s
- ld1 {v6.4s,v7.4s}, [x2], #32
- fmul v18.4s, v2.4s, v4.4s
- fmul v21.4s, v0.4s, v4.4s
- fmul v19.4s, v2.4s, v6.4s
- fmul v20.4s, v0.4s, v6.4s
-
- fmla v18.4s, v3.4s, v5.4s
- fmla v21.4s, v1.4s, v5.4s
- fmla v19.4s, v3.4s, v7.4s
- fmla v20.4s, v1.4s, v7.4s
-
- faddp v16.4s, v16.4s, v17.4s
- faddp v18.4s, v18.4s, v19.4s
- faddp v20.4s, v20.4s, v21.4s
- faddp v22.4s, v22.4s, v23.4s
- faddp v16.4s, v16.4s, v18.4s
- faddp v20.4s, v20.4s, v22.4s
-
- st1 {v16.4s}, [x0], #16
- st1 {v20.4s}, [x4], x7
- b.gt 1b
-
- ret
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
- mov x3, #64 // decifactor
- sub x1, x1, #3*4
- add x4, x0, #2*64*4 - 16 // out2
- mov x7, #-16
-
- ld1 {v0.4s}, [x1]
- // reverse [-num_coeffs + 1, 0]
- ext v1.16b, v0.16b, v0.16b, #8
- rev64 v1.4s, v1.4s
-
-1:
- ld1 {v4.4s,v5.4s}, [x2], #32
- ld1 {v6.4s,v7.4s}, [x2], #32
- subs x3, x3, #4
- fmul v16.4s, v1.4s, v4.4s
- fmul v23.4s, v0.4s, v4.4s
- fmul v17.4s, v1.4s, v5.4s
- fmul v22.4s, v0.4s, v5.4s
- fmul v18.4s, v1.4s, v6.4s
- fmul v21.4s, v0.4s, v6.4s
- fmul v19.4s, v1.4s, v7.4s
- fmul v20.4s, v0.4s, v7.4s
- faddp v16.4s, v16.4s, v17.4s
- faddp v18.4s, v18.4s, v19.4s
- faddp v20.4s, v20.4s, v21.4s
- faddp v22.4s, v22.4s, v23.4s
- faddp v16.4s, v16.4s, v18.4s
- faddp v20.4s, v20.4s, v22.4s
- st1 {v16.4s}, [x0], #16
- st1 {v20.4s}, [x4], x7
- b.gt 1b
-
- ret
-endfunc
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index cdd35b0..5089b7f 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -87,8 +87,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
- arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
# NEON optimizations
@@ -127,8 +126,7 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
- arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
arm/hevcdsp_idct_neon.o \
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 6e87111..ae4b730 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -24,7 +24,6 @@
#include <stdint.h>
#include "config.h"
-#include "libavcodec/dcadsp.h"
#include "libavcodec/mathops.h"
#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 0f2e4c4..3dae5b9 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -22,20 +22,7 @@
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
-void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
-
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- SynthFilterContext *synth, FFTContext *imdct,
- float synth_buf_ptr[512],
- int *synth_buf_offset, float synth_buf2[32],
- const float window[512], float *samples_out,
- float raXin[32], float scale);
+#include "libavcodec/synth_filter.h"
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
@@ -49,21 +36,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir32_vfp;
- s->lfe_fir[1] = ff_dca_lfe_fir64_vfp;
- s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
- }
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 101fee0..0000000
--- a/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
- push {r4-r6,lr}
- mov r3, #32 @ decifactor
- mov r6, #256/32
- b dca_lfe_fir
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
- push {r4-r6,lr}
- mov r3, #64 @ decifactor
- mov r6, #256/64
-dca_lfe_fir:
- add r4, r0, r3, lsl #2 @ out2
- add r5, r2, #256*4-16 @ cf1
- sub r1, r1, #12
- mov lr, #-16
-1:
- vmov.f32 q2, #0.0 @ v0
- vmov.f32 q3, #0.0 @ v1
- mov r12, r6
-2:
- vld1.32 {q8}, [r2,:128]! @ cf0
- vld1.32 {q9}, [r5,:128], lr @ cf1
- vld1.32 {q1}, [r1], lr @ in
- subs r12, r12, #4
- vrev64.32 q10, q8
- vmla.f32 q3, q1, q9
- vmla.f32 d4, d2, d21
- vmla.f32 d5, d3, d20
- bne 2b
-
- add r1, r1, r6, lsl #2
- subs r3, r3, #1
- vadd.f32 d4, d4, d5
- vadd.f32 d6, d6, d7
- vpadd.f32 d5, d4, d6
- vst1.32 {d5[0]}, [r0,:32]!
- vst1.32 {d5[1]}, [r4,:32]!
- bne 1b
-
- pop {r4-r6,pc}
-endfunc
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
deleted file mode 100644
index 2e09f0e..0000000
--- a/libavcodec/arm/dcadsp_vfp.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison at riscosopen.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-POUT .req a1
-PIN .req a2
-PCOEF .req a3
-OLDFPSCR .req a4
-COUNTER .req ip
-
-IN0 .req s4
-IN1 .req s5
-IN2 .req s6
-IN3 .req s7
-IN4 .req s0
-IN5 .req s1
-IN6 .req s2
-IN7 .req s3
-COEF0 .req s8 @ coefficient elements
-COEF1 .req s9
-COEF2 .req s10
-COEF3 .req s11
-COEF4 .req s12
-COEF5 .req s13
-COEF6 .req s14
-COEF7 .req s15
-ACCUM0 .req s16 @ double-buffered multiply-accumulate results
-ACCUM4 .req s20
-POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
-POST1 .req s25
-POST2 .req s26
-POST3 .req s27
-
-
-.macro inner_loop decifactor, dir, tail, head
- .ifc "\dir","up"
- .set X, 0
- .set Y, 4
- .else
- .set X, 4*JMAX*4 - 4
- .set Y, -4
- .endif
- .ifnc "\head",""
- vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
- vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
- .endif
- .ifnc "\head",""
- vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
- vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
- .ifc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
- .ifnc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
- vstmia POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
- vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
- vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
- .if \decifactor == 32
- vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
- vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
- vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
- vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
- vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
- vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
- .endif
- .endif
-.endm
-
-.macro dca_lfe_fir decifactor
-function ff_dca_lfe_fir\decifactor\()_vfp, export=1
- fmrx OLDFPSCR, FPSCR
- ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, ip
- vldr IN0, [PIN, #-0*4]
- vldr IN1, [PIN, #-1*4]
- vldr IN2, [PIN, #-2*4]
- vldr IN3, [PIN, #-3*4]
- .if \decifactor == 32
- .set JMAX, 8
- vpush {s16-s31}
- vldr IN4, [PIN, #-4*4]
- vldr IN5, [PIN, #-5*4]
- vldr IN6, [PIN, #-6*4]
- vldr IN7, [PIN, #-7*4]
- .else
- .set JMAX, 4
- vpush {s16-s27}
- .endif
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, up,, head
-1: add PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, up, tail, head
- bne 1b
- inner_loop \decifactor, up, tail
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, down,, head
-1: sub PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, down, tail, head
- bne 1b
- inner_loop \decifactor, down, tail
-
- .if \decifactor == 32
- vpop {s16-s31}
- .else
- vpop {s16-s27}
- .endif
- fmxr FPSCR, OLDFPSCR
- bx lr
-endfunc
-.endm
-
- dca_lfe_fir 64
- .ltorg
- dca_lfe_fir 32
-
- .unreq POUT
- .unreq PIN
- .unreq PCOEF
- .unreq OLDFPSCR
- .unreq COUNTER
-
- .unreq IN0
- .unreq IN1
- .unreq IN2
- .unreq IN3
- .unreq IN4
- .unreq IN5
- .unreq IN6
- .unreq IN7
- .unreq COEF0
- .unreq COEF1
- .unreq COEF2
- .unreq COEF3
- .unreq COEF4
- .unreq COEF5
- .unreq COEF6
- .unreq COEF7
- .unreq ACCUM0
- .unreq ACCUM4
- .unreq POST0
- .unreq POST1
- .unreq POST2
- .unreq POST3
-
-
-IN .req a1
-SBACT .req a2
-OLDFPSCR .req a3
-IMDCT .req a4
-WINDOW .req v1
-OUT .req v2
-BUF .req v3
-SCALEINT .req v4 @ only used in softfp case
-COUNT .req v5
-
-SCALE .req s0
-
-/* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- * fp -> 6 arg words saved by caller
- * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * s0 on entry
- * sp -> 3 arg words for callee
- *
- * softfp
- * fp -> 7 arg words saved by caller
- * a4,v1-v5,fp,lr on entry
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * sp -> 4 arg words for callee
- */
-
-/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- * SynthFilterContext *synth, FFTContext *imdct,
- * float (*synth_buf_ptr)[512],
- * int *synth_buf_offset, float (*synth_buf2)[32],
- * const float (*window)[512], float *samples_out,
- * float (*raXin)[32], float scale);
- */
-function ff_dca_qmf_32_subbands_vfp, export=1
-VFP push {a3-a4,v1-v3,v5,fp,lr}
-NOVFP push {a4,v1-v5,fp,lr}
- add fp, sp, #8*4
- vpush {s16-s23}
- @ The buffer pointed at by raXin isn't big enough for us to do a
- @ complete matrix transposition as we want to, so allocate an
- @ alternative buffer from the stack. Align to 4 words for speed.
- sub BUF, sp, #8*32*4
- bic BUF, BUF, #15
- mov sp, BUF
- ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
- fmrx OLDFPSCR, FPSCR
- fmxr FPSCR, lr
- @ COUNT is used to count down 2 things at once:
- @ bits 0-4 are the number of word pairs remaining in the output row
- @ bits 5-31 are the number of words to copy (with possible negation)
- @ from the source matrix before we start zeroing the remainder
- mov COUNT, #(-4 << 5) + 16
- adds COUNT, COUNT, SBACT, lsl #5
- bmi 2f
-1:
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- vldr s9, [IN, #(3*8+0)*4]
- vldr s11, [IN, #(3*8+1)*4]
- vldr s13, [IN, #(3*8+2)*4]
- vldr s15, [IN, #(3*8+3)*4]
- vldr s17, [IN, #(3*8+4)*4]
- vldr s19, [IN, #(3*8+5)*4]
- vldr s21, [IN, #(3*8+6)*4]
- vldr s23, [IN, #(3*8+7)*4]
- vneg.f s9, s9
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vneg.f s17, s17
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vstr d4, [BUF, #(0*32+2)*4]
- vstr d5, [BUF, #(1*32+2)*4]
- vstr d6, [BUF, #(2*32+2)*4]
- vstr d7, [BUF, #(3*32+2)*4]
- vstr d8, [BUF, #(4*32+2)*4]
- vstr d9, [BUF, #(5*32+2)*4]
- vstr d10, [BUF, #(6*32+2)*4]
- vstr d11, [BUF, #(7*32+2)*4]
- add IN, IN, #4*8*4
- add BUF, BUF, #4*4
- subs COUNT, COUNT, #(4 << 5) + 2
- bpl 1b
-2: @ Now deal with trailing < 4 samples
- adds COUNT, COUNT, #3 << 5
- bmi 4f @ sb_act was a multiple of 4
- bics lr, COUNT, #0x1F
- bne 3f
- @ sb_act was n*4+1
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vneg.f s16, s16
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
- b 4f
-3: @ sb_act was n*4+2 or n*4+3, so do the first 2
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #(2 << 5) + 1
- bics lr, COUNT, #0x1F
- bne 4f
- @ sb_act was n*4+3
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
-4: @ Now fill the remainder with 0
- vldr s8, zero
- vldr s9, zero
- ands COUNT, COUNT, #0x1F
- beq 6f
-5: vstr d4, [BUF, #(0*32+0)*4]
- vstr d4, [BUF, #(1*32+0)*4]
- vstr d4, [BUF, #(2*32+0)*4]
- vstr d4, [BUF, #(3*32+0)*4]
- vstr d4, [BUF, #(4*32+0)*4]
- vstr d4, [BUF, #(5*32+0)*4]
- vstr d4, [BUF, #(6*32+0)*4]
- vstr d4, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- subs COUNT, COUNT, #1
- bne 5b
-6:
- fmxr FPSCR, OLDFPSCR
- ldr WINDOW, [fp, #3*4]
- ldr OUT, [fp, #4*4]
- sub BUF, BUF, #32*4
-NOVFP ldr SCALEINT, [fp, #6*4]
- mov COUNT, #8
-VFP vpush {SCALE}
-VFP sub sp, sp, #3*4
-NOVFP sub sp, sp, #4*4
-7:
-VFP ldr a1, [fp, #-7*4] @ imdct
-NOVFP ldr a1, [fp, #-8*4]
- ldmia fp, {a2-a4}
-VFP stmia sp, {WINDOW, OUT, BUF}
-NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
-VFP vldr SCALE, [sp, #3*4]
- bl X(ff_synth_filter_float_vfp)
- add OUT, OUT, #32*4
- add BUF, BUF, #32*4
- subs COUNT, COUNT, #1
- bne 7b
-
-A sub sp, fp, #(8+8)*4
-T sub fp, fp, #(8+8)*4
-T mov sp, fp
- vpop {s16-s23}
-VFP pop {a3-a4,v1-v3,v5,fp,pc}
-NOVFP pop {a4,v1-v5,fp,pc}
-endfunc
-
- .unreq IN
- .unreq SBACT
- .unreq OLDFPSCR
- .unreq IMDCT
- .unreq WINDOW
- .unreq OUT
- .unreq BUF
- .unreq SCALEINT
- .unreq COUNT
-
- .unreq SCALE
-
- .align 2
-zero: .word 0
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 502b70a..bc1a48f 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -21,107 +21,8 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pf_inv16: times 4 dd 0x3D800000 ; 1/16
-
SECTION .text
-; %1=v0/v1 %2=in1 %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va m1
-%define vb m2
-%if %1
-%define OFFSET 0
-%else
-%define OFFSET NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
- mova va, [cf0q + OFFSET]
- mova vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
- mova m4, [cf0q + OFFSET + mmsize]
- mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
- mulps va, %2
- mulps vb, %2
-%if %0 == 3
-%if cpuflag(fma3)
- fmaddps va, m4, %3, va
- fmaddps vb, m0, %3, vb
-%else
- mulps m4, %3
- mulps m0, %3
- addps va, m4
- addps vb, m0
-%endif
-%endif
- ; va = va1 va2 va3 va4
- ; vb = vb1 vb2 vb3 vb4
-%if %1
- SWAP va, vb
-%endif
- mova m4, va
- unpcklps va, vb ; va3 vb3 va4 vb4
- unpckhps m4, vb ; va1 vb1 va2 vb2
- addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
- movhlps vb, m4 ; va1+3 vb1+3
- addps vb, m4 ; va0..4 vb0..4
- movlps [outq + count], vb
-%if %1
- sub cf0q, 8*NUM_COEF
-%endif
- add count, 8
- jl .loop%1
-%endmacro
-
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1 m3
-%define IN2 m5
-%define count inq
-%define NUM_COEF 4*(2-%1)
-%define NUM_OUT 32*(%1+1)
-
- movu IN1, [inq + 4 - 1*mmsize]
- shufps IN1, IN1, q0123
-%if %1 == 0
- movu IN2, [inq + 4 - 2*mmsize]
- shufps IN2, IN2, q0123
-%endif
-
- mov count, -4*NUM_OUT
- add cf0q, 4*NUM_COEF*NUM_OUT
- add outq, 4*NUM_OUT
- ; compute v0 first
-%if %1 == 0
- FIR_LOOP 0, IN1, IN2
-%else
- FIR_LOOP 0, IN1
-%endif
- shufps IN1, IN1, q0123
- mov count, -4*NUM_OUT
- ; cf1 already correctly positioned
- add outq, 4*NUM_OUT ; outq now at out2
- sub cf0q, 8*NUM_COEF
-%if %1 == 0
- shufps IN2, IN2, q0123
- FIR_LOOP 1, IN2, IN1
-%else
- FIR_LOOP 1, IN1
-%endif
- RET
-%endmacro
-
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
-%if HAVE_FMA3_EXTERNAL
-INIT_XMM fma3
-DCA_LFE_FIR 0
-%endif
-
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
pxor %1, %1
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 1321dda..0649ea2 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -21,26 +21,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
- s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
- }
-
- if (EXTERNAL_FMA3(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
- }
-}
-
+#include "libavcodec/synth_filter.h"
#define SYNTH_FILTER_FUNC(opt) \
void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
--
2.1.4
More information about the ffmpeg-devel
mailing list