[FFmpeg-devel] [PATCH v4] SH4 mpegaudio decoder optimizations
Guennadi Liakhovetski
g.liakhovetski
Tue Feb 1 18:29:38 CET 2011
Hi all
This is version 4 of the following patch:
This patch implements several mpegaudio optimizations for SH4 FPU-enabled
SoCs. It was verified to speed up decoding of a 128 kbps stereo MP3 audio
file to /dev/null by more than 45%.
---
Yes, I know, the .S file is all indented with "evil" TABs... Please, let's
review it this way, I'll convert it to spaces once accepted. Would have
been too much trouble for me to develop with spaces...
v4: switched to an out-of-line .S assembly implementation
v3, v2, v1: were inline, changelog irrelevant
Thanks to all who commented!
diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
index d9a1fb7..33b1692 100644
--- a/libavcodec/mpc.c
+++ b/libavcodec/mpc.c
@@ -51,7 +51,7 @@ static void mpc_synth(MPCContext *c, int16_t *out, int channels)
for(ch = 0; ch < channels; ch++){
samples_ptr = samples + ch;
for(i = 0; i < SAMPLES_PER_BAND; i++) {
- ff_mpa_synth_filter(c->synth_buf[ch], &(c->synth_buf_offset[ch]),
+ ff_mpa_synth_filter(NULL, c->synth_buf[ch], &(c->synth_buf_offset[ch]),
ff_mpa_synth_window, &dither_state,
samples_ptr, channels,
c->sb_samples[ch][i]);
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index 97c7855..f1a05e8 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -175,7 +175,8 @@ int ff_mpa_l2_select_table(int bitrate, int nb_channels, int freq, int lsf);
int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
extern MPA_INT ff_mpa_synth_window[];
void ff_mpa_synth_init(MPA_INT *window);
-void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
+void ff_mpa_synth_filter(MPADecodeContext *s,
+ MPA_INT *synth_buf_ptr, int *synth_buf_offset,
MPA_INT *window, int *dither_state,
OUT_INT *samples, int incr,
INTFLOAT sb_samples[SBLIMIT]);
@@ -189,6 +190,7 @@ void ff_mpa_synth_filter_float(MPADecodeContext *s,
void ff_mpegaudiodec_init_mmx(MPADecodeContext *s);
void ff_mpegaudiodec_init_altivec(MPADecodeContext *s);
+void ff_mpegaudiodec_init_sh4(MPADecodeContext *s);
/* fast header check for resync */
static inline int ff_mpa_check_header(uint32_t header){
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index 769be89..27ca635 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -329,6 +329,10 @@ static av_cold int decode_init(AVCodecContext * avctx)
#endif
if (HAVE_ALTIVEC && CONFIG_FLOAT) ff_mpegaudiodec_init_altivec(s);
+#if ARCH_SH4 && !CONFIG_FLOAT
+ ff_mpegaudiodec_init_sh4(s);
+#endif
+
avctx->sample_fmt= OUT_FMT;
s->error_recognition= avctx->error_recognition;
@@ -702,7 +706,8 @@ static void apply_window_mp3_c(MPA_INT *synth_buf, MPA_INT *window,
32 samples. */
/* XXX: optimize by avoiding ring buffer usage */
#if !CONFIG_FLOAT
-void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
+void ff_mpa_synth_filter(MPADecodeContext *s,
+ MPA_INT *synth_buf_ptr, int *synth_buf_offset,
MPA_INT *window, int *dither_state,
OUT_INT *samples, int incr,
INTFLOAT sb_samples[SBLIMIT])
@@ -728,7 +733,10 @@ void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
dct32(synth_buf, sb_samples);
#endif
- apply_window_mp3_c(synth_buf, window, dither_state, samples, incr);
+ if (s)
+ s->apply_window_mp3(synth_buf, window, dither_state, samples, incr);
+ else
+ apply_window_mp3_c(synth_buf, window, dither_state, samples, incr);
offset = (offset - 32) & 511;
*synth_buf_offset = offset;
@@ -2013,9 +2021,7 @@ static int mp_decode_frame(MPADecodeContext *s,
samples_ptr = samples + ch;
for(i=0;i<nb_frames;i++) {
RENAME(ff_mpa_synth_filter)(
-#if CONFIG_FLOAT
s,
-#endif
s->synth_buf[ch], &(s->synth_buf_offset[ch]),
RENAME(ff_mpa_synth_window), &s->dither_state,
samples_ptr, s->nb_channels,
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index efcf6b5..94e3b07 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -1616,7 +1616,7 @@ static void qdm2_synthesis_filter (QDM2Context *q, int index)
OUT_INT *samples_ptr = samples + ch;
for (i = 0; i < 8; i++) {
- ff_mpa_synth_filter(q->synth_buf[ch], &(q->synth_buf_offset[ch]),
+ ff_mpa_synth_filter(NULL, q->synth_buf[ch], &(q->synth_buf_offset[ch]),
ff_mpa_synth_window, &dither_state,
samples_ptr, q->nb_channels,
q->sb_samples[ch][(8 * index) + i]);
diff --git a/libavcodec/sh4/Makefile b/libavcodec/sh4/Makefile
index 142cba4..26a1e83 100644
--- a/libavcodec/sh4/Makefile
+++ b/libavcodec/sh4/Makefile
@@ -1,3 +1,4 @@
OBJS += sh4/dsputil_align.o \
sh4/dsputil_sh4.o \
sh4/idct_sh4.o \
+ sh4/mpegaudiodec_sh4.o
diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
index ec06e24..37efdf0 100644
--- a/libavcodec/sh4/dsputil_sh4.c
+++ b/libavcodec/sh4/dsputil_sh4.c
@@ -20,8 +20,11 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#undef CONFIG_FLOAT
+
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegaudio.h"
#include "dsputil_sh4.h"
#include "sh4.h"
@@ -102,3 +105,32 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
c->idct_permutation_type= FF_NO_IDCT_PERM;
}
}
+
+void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w,
+ OUT_INT *samples, int incr); /* assembly, sh4/mpegaudiodec_sh4.S: writes 31 samples with stride incr */
+int round_sample_sh4(int64_t *sum); /* assembly: returns the clipped (*sum >> OUT_SHIFT), leaves the low OUT_SHIFT bits in *sum */
+void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w); /* assembly: *sum += p[64 * k] * w[64 * k] for k = 0..7 */
+/* SH4 windowing hook for s->apply_window_mp3: writes 32 samples (stride incr) from synth_buf and window, carrying the rounding remainder in *dither_state. */
+static void apply_window_mp3_sh4(MPA_INT *synth_buf, MPA_INT *window,
+ int *dither_state, OUT_INT *samples, int incr)
+{
+ int64_t sum, sum2 = 0; /* 64-bit accumulators, handed to the assembly helpers by address */
+
+ /* mirror the first 32 entries behind entry 511 so the reads below never have to wrap; NOTE(review): memcpy needs <string.h>, presumably pulled in by the libavcodec headers - confirm */
+ memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
+
+ sum = *dither_state; /* start from the remainder left by the previous call */
+ sum8_macs_sh4(&sum, synth_buf + 16, window); /* sum += 8 taps taken from synth_buf + 16 and window */
+ sum8_macs_sh4(&sum2, synth_buf + 48, window + 32); /* sum2 = 8 taps taken from synth_buf + 48 and window + 32 */
+ sum -= sum2;
+ *samples = round_sample_sh4(&sum); /* first output sample; sum keeps only the rounding remainder */
+
+ mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr); /* remaining 31 output samples */
+
+ *dither_state = sum; /* hand the remainder on to the next call */
+}
+/* Install the SH4 windowing routine in decoder context s; decode_init calls this when ARCH_SH4 && !CONFIG_FLOAT. */
+void ff_mpegaudiodec_init_sh4(MPADecodeContext *s)
+{
+ s->apply_window_mp3 = apply_window_mp3_sh4; /* ff_mpa_synth_filter calls this pointer when it is given a non-NULL context */
+}
diff --git a/libavcodec/sh4/mpegaudiodec_sh4.S b/libavcodec/sh4/mpegaudiodec_sh4.S
new file mode 100644
index 0000000..ea3b95c
--- /dev/null
+++ b/libavcodec/sh4/mpegaudiodec_sh4.S
@@ -0,0 +1,244 @@
+#define MAXSHORT ((1 << 15) - 1) /* largest 16-bit sample; parenthesized so it always expands as one operand */
+#define MINSHORT (-(1 << 15)) /* smallest 16-bit sample */
+
+#define FRAC_BITS 23 /* fractional bits for sb_samples and dct */
+#define WFRAC_BITS 16 /* fractional bits for window */
+
+#define OUT_MAX MAXSHORT /* upper clip bound, emitted at label max */
+#define OUT_MIN MINSHORT /* lower clip bound, emitted at label min */
+#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) /* bits dropped by round_sample: 16 + 23 - 15 = 24 */
+
+ .macro sh_macs, wp, pp, tmp /* one strided MAC step: skip tmp bytes on both pointers, then accumulate one product */
+ add \tmp, \wp
+ add \tmp, \pp
+ mac.l @\wp+, @\pp+ /* MACH:MACL += (*wp) * (*pp), both pointers advance by 4; NOTE(review): assumes SR.S is clear so the sum is not saturated - confirm */
+ .endm
+
+ .macro sh_sum8_macs, sum, wp, pp, tmp /* 64-bit value at sum += wp[64 * k] * pp[64 * k] for k = 0..7 */
+ mov.l @\sum+, \tmp /* word at offset 0 is loaded as the low half (MACL) */
+ lds \tmp, macl
+ mov.l @\sum+, \tmp /* word at offset 4 is the high half (MACH); NOTE(review): low-word-first layout - confirm for big-endian builds */
+ lds \tmp, mach
+ mov #126, \tmp /* 252 is built as 126 << 1 because immediates are 8-bit signed */
+ shll \tmp /* 63 * 4: with the 4-byte post-increment every step moves 64 elements */
+ mac.l @\wp+, @\pp+ /* 0 */
+ sh_macs \wp, \pp, \tmp /* 1 */
+ sh_macs \wp, \pp, \tmp /* 2 */
+ sh_macs \wp, \pp, \tmp /* 3 */
+ sh_macs \wp, \pp, \tmp /* 4 */
+ sh_macs \wp, \pp, \tmp /* 5 */
+ sh_macs \wp, \pp, \tmp /* 6 */
+ sh_macs \wp, \pp, \tmp /* 7 */
+ sts mach, \tmp
+ mov.l \tmp, @-\sum /* store the high half back first; the two pre-decrements return sum to its entry value */
+ sts macl, \tmp
+ mov.l \tmp, @-\sum
+ .endm /* wp and pp are left advanced past the last tap; callers save and restore them */
+
+ /* void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w): r4 = sum, r5 = p, r6 = w, r3 is scratch */
+
+ .globl sum8_macs_sh4
+ .balign 4
+ .type sum8_macs_sh4, @function
+sum8_macs_sh4:
+ sh_sum8_macs r4, r6, r5, r3 /* macro argument order is sum, wp = w, pp = p, tmp */
+ rts
+ nop /* branch delay slot */
+ .size sum8_macs_sh4, .-sum8_macs_sh4
+
+ .macro round_sample, sum, tmp1, tmp2 /* tmp1 = clip(value >> OUT_SHIFT); the 64-bit value at sum keeps only its low OUT_SHIFT bits */
+ mov.l @\sum, \tmp1 /* lo = low word of the 64-bit value */
+ mov.l m23, \tmp2
+ and \tmp1, \tmp2 /* remainder = lo & ((1 << 24) - 1) */
+ mov.l \tmp2, @\sum /* the remainder becomes the new low word */
+ shlr16 \tmp1
+ shlr8 \tmp1 /* lo >>= 24 (logical; shift counts are hard-coded and must match OUT_SHIFT) */
+ mov.l @(4, \sum), \tmp2
+ shll8 \tmp2 /* hi <<= 8 */
+ or \tmp2, \tmp1 /* lo |= hi */
+ mov #0, \tmp2
+ mov.l \tmp2, @(4, \sum) /* the new high word is zero */
+ /* av_clip to [OUT_MIN, OUT_MAX] */
+ mov.l max, \tmp2
+ cmp/ge \tmp2, \tmp1
+ bt 1f /* result >= OUT_MAX: take the max loaded in tmp2 */
+ mov.l min, \tmp2
+ cmp/gt \tmp2, \tmp1
+ bt 2f /* result > OUT_MIN: already in range */
+1: mov \tmp2, \tmp1
+2:
+ .endm
+
+ /* int round_sample_sh4(int64_t *sum): r4 = sum, result is returned in r0, r1 is scratch */
+
+ .globl round_sample_sh4
+ .balign 4
+ .type round_sample_sh4, @function
+round_sample_sh4:
+ round_sample r4, r0, r1
+ rts
+ nop /* branch delay slot */
+ .size round_sample_sh4, .-round_sample_sh4
+
+ .balign 4
+
+ /* void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w, OUT_INT *samples, int incr), called as (&sum, synth_buf + 32, window + 1, samples + incr, incr): r4 = sum, r5 = synth_buf, r6 = w, r7 = samples, incr on the stack */
+ .globl mp3_win_loop_sh4
+ .balign 4
+ .type mp3_win_loop_sh4, @function
+mp3_win_loop_sh4:
+ mov r6, r0
+ add #120, r0 /* w2 = w + 30 (* 4) */
+ mov #60, r1 /* j = r1 - loop counter: 15 * 4, counts down to 0 */
+
+ mov.l @r15, r2 /* r2 = incr, the fifth argument, read from the stack before anything is pushed */
+ shll r2 /* r2 *= 2: byte stride between 16-bit output samples */
+
+ mov.l r8, @-r15 /* save r8-r11 and pr; they are restored in reverse order at the end */
+ mov.l r9, @-r15
+ mov.l r10, @-r15
+ mov.l r11, @-r15
+ sts.l pr, @-r15
+
+ mov #30, r11
+ muls.w r2, r11 /* 16-bit signed multiply: only the low 16 bits of the stride take part */
+ sts macl, r11
+ add r7, r11 /* samples2 = samples + 30 * incr (* 2) */
+
+4: sub r1, r5 /* p = synth_buf - j (* 4) */
+
+ /* Two 64-bit temporaries live on the stack: the negated sum2 (pushed first)
+ * and tmp (pushed second, popped first) */
+
+ /* SUM8P2_MACS_MLSS(sum, sum2, w, w2, p) */
+
+ mov r5, r8 /* save p */
+ mov r6, r9 /* save w */
+ sh_sum8_macs r4, r6, r5, r3 /* sum += p[] * w[] (macro arguments: sum, window, pp, tmp) */
+ mov r8, r5 /* restore p */
+ mov r9, r6 /* restore w */
+
+ mov #0, r10
+ mov.l r10, @-r15 /* sum2 = 0 */
+ mov.l r10, @-r15
+ mov r15, r10 /* r10 = &sum2 */
+
+ mov r0, r9 /* save w2 */
+ sh_sum8_macs r10, r0, r5, r3 /* stack slot = +p[] * w2[], i.e. the negated sum2; it is now at the top of the stack */
+ mov r9, r0 /* restore w2 */
+ mov r8, r5 /* restore p */
+
+ /* SUM8P2_MLSS_MLSS(sum, sum2, w + 32, w2 + 32, p) */
+
+ add r1, r5
+ add r1, r5 /* p += 2 * j (* 4) */
+
+ mov r5, r8 /* save r5 */
+ mov r6, r9 /* save r6 before incrementing */
+
+ add #64, r6
+ add #64, r6 /* w += 32 (* 4): 128 < 0 in 8-bit arith. */
+ mov #0, r10
+ mov.l r10, @-r15 /* int64_t tmp = 0 */
+ mov.l r10, @-r15
+
+ mov r15, r10 /* r10 = &tmp */
+
+ sh_sum8_macs r10, r6, r5, r3 /* tmp = p[] * (w + 32)[]; tmp is now at the top of the stack */
+ mov r9, r6 /* restore w */
+ mov r8, r5 /* restore p */
+
+ clrt /* clear T so the first subc has no borrow-in */
+
+ /* sum -= tmp - 64-bit signed */
+ mov.l @r15+, r8 /* low 32-bits of the tmp-sum */
+ mov.l @r4, r10 /* low 32-bits of the sum */
+ subc r8, r10
+ mov.l r10, @r4
+ mov.l @r15+, r8 /* high 32-bits of the tmp-sum */
+ mov.l @(4, r4), r10 /* high 32-bits of the sum */
+ subc r8, r10
+ mov.l r10, @(4, r4)
+
+ mov r5, r8 /* save p */
+ mov r0, r10 /* save w2 */
+ add #64, r0
+ add #64, r0 /* w2 += 32 (* 4): 128 < 0 in 8-bit arith. */
+
+ mov r15, r9 /* r9 = &sum2 */
+ sh_sum8_macs r9, r0, r5, r3 /* stack slot += p[] * (w2 + 32)[]; the negated sum2 is again at the top of the stack */
+ mov r8, r5 /* restore p */
+
+ round_sample r4, r0, r3 /* r0 = clipped sample, sum keeps the remainder */
+
+ mov.w r0, @r7 /* *samples = round_samples() */
+ add r2, r7 /* samples += incr (* 2) */
+
+ clrt /* clear T so the first subc has no borrow-in */
+
+ mov.l @r15+, r0 /* sum -= negated sum2 (i.e. sum += sum2): pop its low 32-bits */
+ mov.l @r4, r3
+ subc r0, r3
+ mov.l r3, @r4
+ mov.l @r15+, r0 /* high 32-bits */
+ mov.l @(4, r4), r3
+ subc r0, r3
+
+ mov.l r3, @(4, r4)
+ round_sample r4, r0, r3 /* r0 = clipped sample, sum keeps the remainder */
+
+ mov.w r0, @r11 /* *samples2 = round_samples() */
+ sub r2, r11 /* samples2 -= incr (* 2) */
+
+ sub r1, r5 /* synth_buf = p - j */
+ mov r10, r0 /* restore w2 */
+ add #-4, r0 /* w2-- */
+
+ add #-4, r1 /* j-- (* 4) */
+ cmp/pl r1 /* T = (j > 0) */
+ bf 5f /* j == 0: leave the loop through the w++ below */
+ bra 4b
+ /* CAREFUL: the w++ below is still in the loop - it sits in the delay slot of the bra and is also the target of the bf */
+5: add #4, r6 /* w++ */
+
+ mov #0, r10
+ mov.l r10, @-r15 /* int64_t tmp = 0 */
+ mov.l r10, @-r15
+ mov r15, r10 /* r10 = &tmp */
+
+ add #64, r6
+ add #64, r6 /* w += 32 (* 4): 128 < 0 in 8-bit arith. */
+
+ mov r15, r10 /* r10 = &tmp (redundant: already set above) */
+ sh_sum8_macs r10, r6, r5, r3 /* tmp = synth_buf[] * (w + 32)[]; tmp is now at the top of the stack */
+
+ clrt /* clear T so the first subc has no borrow-in */
+
+ /* sum -= tmp - 64-bit signed */
+ mov.l @r15+, r8 /* low 32-bits of the tmp-sum */
+ mov.l @r4, r9 /* low 32-bits of the sum */
+ subc r8, r9
+ mov.l r9, @r4
+ mov.l @r15+, r8 /* high 32-bits of the tmp-sum */
+ mov.l @(4, r4), r9 /* high 32-bits of the sum */
+ subc r8, r9
+
+ mov.l r9, @(4, r4)
+ round_sample r4, r0, r1 /* r0 = clipped sample; the remainder stays in sum for the caller */
+
+ mov.w r0, @r7 /* *samples = round_samples() */
+
+ lds.l @r15+, pr /* pr is saved and restored although nothing in this function is called */
+ mov.l @r15+, r11
+ mov.l @r15+, r10
+ mov.l @r15+, r9
+
+ rts
+ mov.l @r15+, r8 /* delay slot: runs before the return takes effect */
+
+ .size mp3_win_loop_sh4, .-mp3_win_loop_sh4
+
+ .align 4 /* NOTE(review): the rest of the file uses .balign 4; check how this assembler interprets the .align argument */
+max: .long OUT_MAX /* upper clip bound loaded by round_sample */
+min: .long OUT_MIN /* lower clip bound loaded by round_sample */
+m23: .long (1 << OUT_SHIFT) - 1 /* mask of the low OUT_SHIFT (24) bits, despite the label name */
More information about the ffmpeg-devel
mailing list