[FFmpeg-devel] [PATCH] ATRAC3+ decoder, 2nd try

Mon Oct 21 18:10:38 CEST 2013

On Sat, Oct 19, 2013 at 12:37 AM, Maxim Polijakowski <max_pole at gmx.de>wrote:

> Hello guys,
>
> due to a bunch of comments and fixes I've decided to start a new thread.
> Please find attached an updated patch.
>
> Below a detailed overview of what has been changed:
>
> --> init_get_bits has been replaced with init_get_bits8
> --> Fixed a memleak caused by allocation of ch_units
> --> Added several missing static const
> --> VLC tables have been made static. There is no dynamic allocation and
> deallocation anymore
> --> Loops simplifications and some other minor code refractions
> --> Several "delta = (delta_bits ? get_bits(gb, delta_bits) : 0)" have
> been made a macro
> --> Added several safety checks before reading from bitstream
> --> decode_frame now returns avpkt->size instead of avctx->block_align
> --> replace amp_mant_table with direct computation
> --> AVFloatDSPContext has been moved to the decoder context
> --> Several decoding tables have been aligned to be SIMD-friendly
> --> IMDCT windowing have been reworked to use ff_sine_64 directly
> --> Replace several loops with float vector operations
> --> Fix a bug causing random decoder crashes
> --> better support for mono samples coming form PSP
>

Nice, I found the time to look a little more at it and have a few more
comments:

+    for (i = 0; i < ctx->num_channel_blocks; i++) {
+        for (ch = 0; ch < 2; ch++) {
+            ctx->ch_units[i].channels[ch].ch_num          = ch;
+            ctx->ch_units[i].channels[ch].wnd_shape       =
&ctx->ch_units[i].channels[ch].wnd_shape_hist[0][0];
+            ctx->ch_units[i].channels[ch].wnd_shape_prev  =
&ctx->ch_units[i].channels[ch].wnd_shape_hist[1][0];
+            ctx->ch_units[i].channels[ch].gain_data       =
&ctx->ch_units[i].channels[ch].gain_data_hist[0][0];
+            ctx->ch_units[i].channels[ch].gain_data_prev  =
&ctx->ch_units[i].channels[ch].gain_data_hist[1][0];
+            ctx->ch_units[i].channels[ch].tones_info      =
&ctx->ch_units[i].channels[ch].tones_info_hist[0][0];
+            ctx->ch_units[i].channels[ch].tones_info_prev =
&ctx->ch_units[i].channels[ch].tones_info_hist[1][0];
+

+            /* clear IMDCT overlapping buffer */
+            memset(&ctx->ch_units[i].prev_buf[ch][0], 0,
+                   sizeof(ctx->ch_units[i].prev_buf[ch][0]) *
+                   ATRAC3P_FRAME_SAMPLES);
+            /* clear IPQF history */
+            memset(&ctx->ch_units[i].ipqf_ctx[ch], 0,
+                   sizeof(ctx->ch_units[i].ipqf_ctx[ch]));

This zeroing is most likely unneeded (the coded context is alloced with
av_mallocz).

+    if (ctx->unit_type == CH_UNIT_STEREO) {
+        for (sb = 0; sb < ctx->num_coded_subbands; sb++) {
+            if (ctx->swap_channels[sb]) {
+                memcpy(tmp, &out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
+                       ATRAC3P_SUBBAND_SAMPLES * sizeof(*tmp));
+                memcpy(&out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
+                       &out[1][sb * ATRAC3P_SUBBAND_SAMPLES],
+                       ATRAC3P_SUBBAND_SAMPLES *
+                       sizeof(out[0][sb * ATRAC3P_SUBBAND_SAMPLES]));
+                memcpy(&out[1][sb * ATRAC3P_SUBBAND_SAMPLES], tmp,
+                       ATRAC3P_SUBBAND_SAMPLES *
+                       sizeof(out[1][sb * ATRAC3P_SUBBAND_SAMPLES]));
+            }

This is probably faster and simpler this way

for (j=0; j < ATRAC3P_SUBBAND_SAMPLES; j++)
    FFSWAP(float, out[0][sb * ATRAC3P_SUBBAND_SAMPLES + j], out[1][sb *
ATRAC3P_SUBBAND_SAMPLES + j]);

+    if (ctx->mute_flag)
+        for (ch = 0; ch < num_channels; ch++)
+            memset(out[ch], 0, ATRAC3P_FRAME_SAMPLES * sizeof(*out[ch]));

This can checked earlier (no need to calculate everything just for zeroing
it later).

+static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx
*ch_unit,
+                              int num_channels, AVCodecContext *avctx)
+{
+    int ch, sb;
+
+    for (ch = 0; ch < num_channels; ch++) {
+        for (sb = 0; sb < ch_unit->num_subbands; sb++) {
+            /* inverse transform and windowing */
+            ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
+                             &ctx->samples[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+                             &ctx->mdct_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+                             (ch_unit->channels[ch].wnd_shape_prev[sb] <<
1) +
+                             ch_unit->channels[ch].wnd_shape[sb], sb);
+
+            /* gain compensation and overlapping */
+            ff_atrac_gain_compensation(&ctx->gainc_ctx,
+                                       &ctx->mdct_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+                                       &ch_unit->prev_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES],
+
&ch_unit->channels[ch].gain_data_prev[sb],
+
&ch_unit->channels[ch].gain_data[sb],
+                                       ATRAC3P_SUBBAND_SAMPLES,
+                                       &ctx->time_buf[ch][sb *
ATRAC3P_SUBBAND_SAMPLES]);
+        }
+
+        /* zero unused subbands in both output and overlapping buffers */
+        memset(&ch_unit->prev_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES],
+               0,
+               (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
+               ATRAC3P_SUBBAND_SAMPLES *
+               sizeof(ch_unit->prev_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES]));
+        memset(&ctx->time_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES],
+               0,
+               (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
+               ATRAC3P_SUBBAND_SAMPLES *
+               sizeof(ctx->time_buf[ch][ch_unit->num_subbands *
ATRAC3P_SUBBAND_SAMPLES]));

Can the memsets be avoided by simply ignoring the buffers' previous values
in the function that actually fill them up?

For example, in ff_atrac3p_generate_tones(), replacing

+    /* Overlap and add to residual */
+    for (i = 0; i < 128; i++)
+        out[i] += wavreg1[i] + wavreg2[i];

by

    /* Overlap and add to residual */
    for (i = 0; i < 128; i++)
        out[i] = wavreg1[i] + wavreg2[i];

It also saves you one sum operation.

+
+#define TWOPI (2 * M_PI)

I'm not sure this define improves readability in any way.

+    /* generate sine wave table */
+    for (i = 0; i < 2048; i++)
+        sine_table[i] = sin(TWOPI * i / 2048);

...

+        inc = wave_param->freq_index;
+        pos = DEQUANT_PHASE(wave_param->phase_index) - (reg_offset ^ 128)
* inc & 0x7FF;

If you replace "& 0x7FF" by "& 2047", it will be more obvious why you are
doing it.

+void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit,
AVFloatDSPContext *fdsp,
+                               int ch_num, int sb, float *out)
+{
+    DECLARE_ALIGNED(32, float, wavreg1)[128] = { 0 };
+    DECLARE_ALIGNED(32, float, wavreg2)[128] = { 0 };
+    int i, reg1_env_nonzero, reg2_env_nonzero;
+    Atrac3pWavesData *tones_now  =
&ch_unit->channels[ch_num].tones_info_prev[sb];
+    Atrac3pWavesData *tones_next =
&ch_unit->channels[ch_num].tones_info[sb];
+
+    /* reconstruct full envelopes for both overlapping regions
+     * from truncated bitstream data */
+    if (tones_next->pend_env.has_start_point &&
+        tones_next->pend_env.start_pos < tones_next->pend_env.stop_pos) {
+        tones_next->curr_env.has_start_point = 1;
+        tones_next->curr_env.start_pos       =
tones_next->pend_env.start_pos + 32;
+    } else if (tones_now->pend_env.has_start_point) {
+        tones_next->curr_env.has_start_point = 1;
+        tones_next->curr_env.start_pos       =
tones_now->pend_env.start_pos;
+    } else {
+        tones_next->curr_env.has_start_point = 0;
+        tones_next->curr_env.start_pos       = 0;
+    }
+
+    if (tones_now->pend_env.has_stop_point &&
+        tones_now->pend_env.stop_pos >= tones_next->curr_env.start_pos) {
+        tones_next->curr_env.has_stop_point = 1;
+        tones_next->curr_env.stop_pos       = tones_now->pend_env.stop_pos;
+    } else if (tones_next->pend_env.has_stop_point) {
+        tones_next->curr_env.has_stop_point = 1;
+        tones_next->curr_env.stop_pos       =
tones_next->pend_env.stop_pos + 32;
+    } else {
+        tones_next->curr_env.has_stop_point = 0;
+        tones_next->curr_env.stop_pos       = 64;
+    }
+
+    /* is the visible part of the envelope non-zero? */
+    reg1_env_nonzero = (tones_now->curr_env.stop_pos    < 32) ? 0 : 1;
+    reg2_env_nonzero = (tones_next->curr_env.start_pos >= 32) ? 0 : 1;
+
+    /* synthesize waves for both overlapping regions */
+    if (tones_now->num_wavs && reg1_env_nonzero)
+        waves_synth(ch_unit->waves_info_prev, tones_now,
&tones_now->curr_env,
+                    ch_unit->waves_info_prev->phase_shift[sb] & ch_num,
+                    128, wavreg1);
+
+    if (tones_next->num_wavs && reg2_env_nonzero)
+        waves_synth(ch_unit->waves_info, tones_next, &tones_next->curr_env,
+                    ch_unit->waves_info->phase_shift[sb] & ch_num, 0,
wavreg2);
+

+    /* Hann windowing for non-faded wave signals */
+    if (tones_now->num_wavs && tones_next->num_wavs &&
+        reg1_env_nonzero && reg2_env_nonzero) {
+        fdsp->vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
+        fdsp->vector_fmul(wavreg2, wavreg2,  hann_window,      128);
+    } else {
+        if (tones_now->num_wavs && !tones_now->curr_env.has_stop_point)
+            fdsp->vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
+
+        if (tones_next->num_wavs && !tones_next->curr_env.has_start_point)
+            fdsp->vector_fmul(wavreg2, wavreg2, hann_window, 128);
+    }
+
+    /* Overlap and add to residual */
+    for (i = 0; i < 128; i++)
+        out[i] += wavreg1[i] + wavreg2[i];
+}

I think it would be better something in the lines of (WARNING: untested,
probably wrong)

    /* Hann windowing for non-faded wave signals */
    if (tones_now->num_wavs && tones_next->num_wavs &&
        reg1_env_nonzero && reg2_env_nonzero) {
        fdsp->vector_fmul_window(out, wavreg1, wavreg2, hann_window, 128);
    } else if (!tones_now->num_wavs || (!tones_now->curr_env.has_stop_point
&& !tones_next->curr_env.has_start_point)) {
        memset(out, 0, 256*sizeof(*out));
    } else if (tones_next->curr_env.has_stop_point){
        fdsp->vector_fmul_reverse(wavreg1, wavreg1, hann_window, 128);
    } else if (tones_now->curr_env.has_start_point){
        fdsp->vector_fmul(out, wavreg2, hann_window, 128);
    }

One additional advantage of doing this way would be to have a half as long
hann_window buffer (since hann_window[i] == hann_window[256-i]).

+/* First half of the 384-tap IPQF filtering coefficients. */
+static const float ipqf_coeffs1[ATRAC3P_PQF_FIR_LEN][16] = {
+    { -5.8336207e-7,    -8.0604229e-7,    -4.2005411e-7,    -4.4400572e-8,
+       3.226247e-8,      3.530856e-8,      1.2660377e-8,
0.000010516783,
+      -0.000011838618,   6.005389e-7,      0.0000014333754,
0.0000023108685,
+       0.0000032569742,  0.0000046192422,  0.0000063894258,
0.0000070302972 },

...

+    { -0.0000062990207, -0.0000072701259, -0.000011984052,
-0.000017348082,
+      -0.000019907106,  -0.000021348773,  -0.000021961965,
-0.000012203576,
+      -0.000010840992,   4.6299544e-7,     5.2588763e-7,     2.7792686e-7,
+      -2.3649704e-7,    -0.0000010897784, -9.171448e-7,     -5.22682e-7 }
+};
+
+/* Second half of the 384-tap IPQF filtering coefficients. */
+static const float ipqf_coeffs2[ATRAC3P_PQF_FIR_LEN][16] = {
+    {  5.22682e-7,       9.171448e-7,      0.0000010897784,  2.3649704e-7,
+      -2.7792686e-7,    -5.2588763e-7,    -4.6299544e-7,
0.000010840992,
+      -0.000012203576,  -0.000021961965,  -0.000021348773,
-0.000019907106,
+      -0.000017348082,  -0.000011984052,  -0.0000072701259,
-0.0000062990207 },

...

+    { -0.0000070302972, -0.0000063894258, -0.0000046192422,
-0.0000032569742,
+      -0.0000023108685, -0.0000014333754, -6.005389e-7,
0.000011838618,
+       0.000010516783,   1.2660377e-8,     3.530856e-8,      3.226247e-8,
+      -4.4400572e-8,    -4.2005411e-7,    -8.0604229e-7,    -5.8336207e-7 }
+};

Isn't one table just the other backwards?

-Vitor