[FFmpeg-devel] [PATCH] ATRAC3+ decoder, 2nd try
Vitor Sessak
vitor1001 at gmail.com
Mon Oct 21 18:10:38 CEST 2013
On Sat, Oct 19, 2013 at 12:37 AM, Maxim Polijakowski <max_pole at gmx.de>wrote:
> Hello guys,
>
> due to a bunch of comments and fixes I've decided to start a new thread.
> Please find attached an updated patch.
>
> Below a detailed overview of what has been changed:
>
> --> init_get_bits has been replaced with init_get_bits8
> --> Fixed a memleak caused by allocation of ch_units
> --> Added several missing static const
> --> VLC tables have been made static. There is no dynamic allocation and
> deallocation anymore
> --> Loops simplifications and some other minor code refractions
> --> Several "delta = (delta_bits ? get_bits(gb, delta_bits) : 0)" have
> been made a macro
> --> Added several safety checks before reading from bitstream
> --> decode_frame now returns avpkt->size instead of avctx->block_align
> --> replace amp_mant_table with direct computation
> --> AVFloatDSPContext has been moved to the decoder context
> --> Several decoding tables have been aligned to be SIMD-friendly
> --> IMDCT windowing have been reworked to use ff_sine_64 directly
> --> Replace several loops with float vector operations
> --> Fix a bug causing random decoder crashes
> --> better support for mono samples coming form PSP
>
Nice, I found the time to look a little more at it and have a few more
comments:
+ for (i = 0; i < ctx->num_channel_blocks; i++) {
+ for (ch = 0; ch < 2; ch++) {
+ ctx->ch_units[i].channels[ch].ch_num = ch;
+ ctx->ch_units[i].channels[ch].wnd_shape =
&ctx->ch_units[i].channels[ch].wnd_shape_hist[0][0];
+ ctx->ch_units[i].channels[ch].wnd_shape_prev =
&ctx->ch_units[i].channels[ch].wnd_shape_hist[1][0];
+ ctx->ch_units[i].channels[ch].gain_data =
&ctx->ch_units[i].channels[ch].gain_data_hist[0][0];
+ ctx->ch_units[i].channels[ch].gain_data_prev =
&ctx->ch_units[i].channels[ch].gain_data_hist[1][0];
+ ctx->ch_units[i].channels[ch].tones_info =
&ctx->ch_units[i].channels[ch].tones_info_hist[0][0];
+ ctx->ch_units[i].channels[ch].tones_info_prev =
&ctx->ch_units[i].channels[ch].tones_info_hist[1][0];
+
+ /* clear IMDCT overlapping buffer */
+ memset(&ctx->ch_units[i].prev_buf[ch][0], 0,
+ sizeof(ctx->ch_units[i].prev_buf[ch][0]) *
+ ATRAC3P_FRAME_SAMPLES);
+ /* clear IPQF history */
+ memset(&ctx->ch_units[i].ipqf_ctx[ch], 0,
+ sizeof(ctx->ch_units[i].ipqf_ctx[ch]));
This zeroing is most likely unneeded (the coded context is alloced with
av_mallocz).
+ if (ctx->unit_type == CH_UNIT_STEREO) {
+ for (sb = 0; sb < ctx->num_coded_subbands; sb++) {
+ if (ctx->swap_channels[sb]) {
+ memcpy(tmp, &out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
+ ATRAC3P_SUBBAND_SAMPLES * sizeof(*tmp));
+ memcpy(&out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
+ &out[1][sb * ATRAC3P_SUBBAND_SAMPLES],
+ ATRAC3P_SUBBAND_SAMPLES *
+ sizeof(out[0][sb * ATRAC3P_SUBBAND_SAMPLES]));
+ memcpy(&out[1][sb * ATRAC3P_SUBBAND_SAMPLES], tmp,
+ ATRAC3P_SUBBAND_SAMPLES *
+ sizeof(out[1][sb * ATRAC3P_SUBBAND_SAMPLES]));
+ }
This is probably faster and simpler this way
for (j=0; j < ATRAC3P_SUBBAND_SAMPLES; j++)
FFSWAP(float, out[0][sb * ATRAC3P_SUBBAND_SAMPLES + j], out[1][sb *
ATRAC3P_SUBBAND_SAMPLES + j]);
+ if (ctx->mute_flag)
+ for (ch = 0; ch < num_channels; ch++)
+ memset(out[ch], 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out[ch]));
This can checked earlier (no need to calculate everything just for zeroing
it later).
+static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx
*ch_unit,
+ int num_channels, AVCodecContext *avctx)
+{
+ int ch, sb;
+
+ for (ch = 0; ch < num_channels; ch++) {
+ for (sb = 0; sb < ch_unit->num_subbands; sb++) {
+ /* inverse transform and windowing */
+ ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
+ &ctx->samples[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+ &ctx->mdct_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+ (ch_unit->channels[ch].wnd_shape_prev[sb] <<
1) +
+ ch_unit->channels[ch].wnd_shape[sb], sb);
+
+ /* gain compensation and overlapping */
+ ff_atrac_gain_compensation(&ctx->gainc_ctx,
+ &ctx->mdct_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+ &ch_unit->prev_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+
&ch_unit->channels[ch].gain_data_prev[sb],
+
&ch_unit->channels[ch].gain_data[sb],
+ ATRAC3P_SUBBAND_SAMPLES,
+ &ctx->time_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES]);
+ }
+
+ /* zero unused subbands in both output and overlapping buffers */
+ memset(&ch_unit->prev_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES],
+ 0,
+ (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
+ ATRAC3P_SUBBAND_SAMPLES *
+ sizeof(ch_unit->prev_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES]));
+ memset(&ctx->time_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES],
+ 0,
+ (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
+ ATRAC3P_SUBBAND_SAMPLES *
+ sizeof(ctx->time_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES]));
Can the memsets be avoided by simply ignoring the buffers' previous values
in the function that actually fill them up?
For example, in ff_atrac3p_generate_tones(), replacing
+ /* Overlap and add to residual */
+ for (i = 0; i < 128; i++)
+ out[i] += wavreg1[i] + wavreg2[i];
by
/* Overlap and add to residual */
for (i = 0; i < 128; i++)
out[i] = wavreg1[i] + wavreg2[i];
It also saves you one sum operation.
+
+#define TWOPI (2 * M_PI)
I'm not sure this define improves readability in any way.
+ /* generate sine wave table */
+ for (i = 0; i < 2048; i++)
+ sine_table[i] = sin(TWOPI * i / 2048);
...
+ inc = wave_param->freq_index;
+ pos = DEQUANT_PHASE(wave_param->phase_index) - (reg_offset ^ 128)
* inc & 0x7FF;
If you replace "& 0x7FF" by "& 2047", it will be more obvious why you are
doing it.
+void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit,
AVFloatDSPContext *fdsp,
+ int ch_num, int sb, float *out)
+{
+ DECLARE_ALIGNED(32, float, wavreg1)[128] = { 0 };
+ DECLARE_ALIGNED(32, float, wavreg2)[128] = { 0 };
+ int i, reg1_env_nonzero, reg2_env_nonzero;
+ Atrac3pWavesData *tones_now =
&ch_unit->channels[ch_num].tones_info_prev[sb];
+ Atrac3pWavesData *tones_next =
&ch_unit->channels[ch_num].tones_info[sb];
+
+ /* reconstruct full envelopes for both overlapping regions
+ * from truncated bitstream data */
+ if (tones_next->pend_env.has_start_point &&
+ tones_next->pend_env.start_pos < tones_next->pend_env.stop_pos) {
+ tones_next->curr_env.has_start_point = 1;
+ tones_next->curr_env.start_pos =
tones_next->pend_env.start_pos + 32;
+ } else if (tones_now->pend_env.has_start_point) {
+ tones_next->curr_env.has_start_point = 1;
+ tones_next->curr_env.start_pos =
tones_now->pend_env.start_pos;
+ } else {
+ tones_next->curr_env.has_start_point = 0;
+ tones_next->curr_env.start_pos = 0;
+ }
+
+ if (tones_now->pend_env.has_stop_point &&
+ tones_now->pend_env.stop_pos >= tones_next->curr_env.start_pos) {
+ tones_next->curr_env.has_stop_point = 1;
+ tones_next->curr_env.stop_pos = tones_now->pend_env.stop_pos;
+ } else if (tones_next->pend_env.has_stop_point) {
+ tones_next->curr_env.has_stop_point = 1;
+ tones_next->curr_env.stop_pos =
tones_next->pend_env.stop_pos + 32;
+ } else {
+ tones_next->curr_env.has_stop_point = 0;
+ tones_next->curr_env.stop_pos = 64;
+ }
+
+ /* is the visible part of the envelope non-zero? */
+ reg1_env_nonzero = (tones_now->curr_env.stop_pos < 32) ? 0 : 1;
+ reg2_env_nonzero = (tones_next->curr_env.start_pos >= 32) ? 0 : 1;
+
+ /* synthesize waves for both overlapping regions */
+ if (tones_now->num_wavs && reg1_env_nonzero)
+ waves_synth(ch_unit->waves_info_prev, tones_now,
&tones_now->curr_env,
+ ch_unit->waves_info_prev->phase_shift[sb] & ch_num,
+ 128, wavreg1);
+
+ if (tones_next->num_wavs && reg2_env_nonzero)
+ waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env,
+ ch_unit->waves_info->phase_shift[sb] & ch_num, 0,
wavreg2);
+
+ /* Hann windowing for non-faded wave signals */
+ if (tones_now->num_wavs && tones_next->num_wavs &&
+ reg1_env_nonzero && reg2_env_nonzero) {
+ fdsp->vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
+ fdsp->vector_fmul(wavreg2, wavreg2, hann_window, 128);
+ } else {
+ if (tones_now->num_wavs && !tones_now->curr_env.has_stop_point)
+ fdsp->vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
+
+ if (tones_next->num_wavs && !tones_next->curr_env.has_start_point)
+ fdsp->vector_fmul(wavreg2, wavreg2, hann_window, 128);
+ }
+
+ /* Overlap and add to residual */
+ for (i = 0; i < 128; i++)
+ out[i] += wavreg1[i] + wavreg2[i];
+}
I think it would be better something in the lines of (WARNING: untested,
probably wrong)
/* Hann windowing for non-faded wave signals */
if (tones_now->num_wavs && tones_next->num_wavs &&
reg1_env_nonzero && reg2_env_nonzero) {
fdsp->vector_fmul_window(out, wavreg1, wavreg2, hann_window, 128);
} else if (!tones_now->num_wavs || (!tones_now->curr_env.has_stop_point
&& !tones_next->curr_env.has_start_point)) {
memset(out, 0, 256*sizeof(*out));
} else if (tones_next->curr_env.has_stop_point){
fdsp->vector_fmul_reverse(wavreg1, wavreg1, hann_window, 128);
} else if (tones_now->curr_env.has_start_point){
fdsp->vector_fmul(out, wavreg2, hann_window, 128);
}
One additional advantage of doing this way would be to have a half as long
hann_window buffer (since hann_window[i] == hann_window[256-i]).
+/* First half of the 384-tap IPQF filtering coefficients. */
+static const float ipqf_coeffs1[ATRAC3P_PQF_FIR_LEN][16] = {
+ { -5.8336207e-7, -8.0604229e-7, -4.2005411e-7, -4.4400572e-8,
+ 3.226247e-8, 3.530856e-8, 1.2660377e-8,
0.000010516783,
+ -0.000011838618, 6.005389e-7, 0.0000014333754,
0.0000023108685,
+ 0.0000032569742, 0.0000046192422, 0.0000063894258,
0.0000070302972 },
...
+ { -0.0000062990207, -0.0000072701259, -0.000011984052,
-0.000017348082,
+ -0.000019907106, -0.000021348773, -0.000021961965,
-0.000012203576,
+ -0.000010840992, 4.6299544e-7, 5.2588763e-7, 2.7792686e-7,
+ -2.3649704e-7, -0.0000010897784, -9.171448e-7, -5.22682e-7 }
+};
+
+/* Second half of the 384-tap IPQF filtering coefficients. */
+static const float ipqf_coeffs2[ATRAC3P_PQF_FIR_LEN][16] = {
+ { 5.22682e-7, 9.171448e-7, 0.0000010897784, 2.3649704e-7,
+ -2.7792686e-7, -5.2588763e-7, -4.6299544e-7,
0.000010840992,
+ -0.000012203576, -0.000021961965, -0.000021348773,
-0.000019907106,
+ -0.000017348082, -0.000011984052, -0.0000072701259,
-0.0000062990207 },
...
+ { -0.0000070302972, -0.0000063894258, -0.0000046192422,
-0.0000032569742,
+ -0.0000023108685, -0.0000014333754, -6.005389e-7,
0.000011838618,
+ 0.000010516783, 1.2660377e-8, 3.530856e-8, 3.226247e-8,
+ -4.4400572e-8, -4.2005411e-7, -8.0604229e-7, -5.8336207e-7 }
+};
Isn't one table just the other backwards?
-Vitor
More information about the ffmpeg-devel
mailing list