[FFmpeg-devel] [PATCH] ATRAC3+ decoder, 2nd try

Mon Nov 25 00:17:13 CET 2013

On Nov 19, 2013 11:52 AM, "Maxim Polijakowski" <max_pole at gmx.de> wrote:
>
> Hello Vitor,
>
>
> a new patch has been sent in a separate mail.
>
> My comments are located below...
>
>>> [...]
>>>
>>>
>>> Nice, I found the time to look a little more at it and have a few more
>>> comments:
>>>
>>> +    for (i = 0; i < ctx->num_channel_blocks; i++) {
>>> +        for (ch = 0; ch < 2; ch++) {
>>> +            ctx->ch_units[i].channels[ch].ch_num          = ch;
>>> +            ctx->ch_units[i].channels[ch].wnd_shape       =
>>> &ctx->ch_units[i].channels[ch].wnd_shape_hist[0][0];
>>> +            ctx->ch_units[i].channels[ch].wnd_shape_prev  =
>>> &ctx->ch_units[i].channels[ch].wnd_shape_hist[1][0];
>>> +            ctx->ch_units[i].channels[ch].gain_data       =
>>> &ctx->ch_units[i].channels[ch].gain_data_hist[0][0];
>>> +            ctx->ch_units[i].channels[ch].gain_data_prev  =
>>> &ctx->ch_units[i].channels[ch].gain_data_hist[1][0];
>>> +            ctx->ch_units[i].channels[ch].tones_info      =
>>> &ctx->ch_units[i].channels[ch].tones_info_hist[0][0];
>>> +            ctx->ch_units[i].channels[ch].tones_info_prev =
>>> &ctx->ch_units[i].channels[ch].tones_info_hist[1][0];
>>> +
>>>
>>> +            /* clear IMDCT overlapping buffer */
>>> +            memset(&ctx->ch_units[i].prev_buf[ch][0], 0,
>>> +                   sizeof(ctx->ch_units[i].prev_buf[ch][0]) *
>>> +                   ATRAC3P_FRAME_SAMPLES);
>>> +            /* clear IPQF history */
>>> +            memset(&ctx->ch_units[i].ipqf_ctx[ch], 0,
>>> +                   sizeof(ctx->ch_units[i].ipqf_ctx[ch]));
>>>
>>> This zeroing is most likely unneeded (the coded context is alloced with
>>> av_mallocz).
>
>
> Yes, that zeroing has been removed...
>
>
>> +    if (ctx->unit_type == CH_UNIT_STEREO) {
>> +        for (sb = 0; sb < ctx->num_coded_subbands; sb++) {
>> +            if (ctx->swap_channels[sb]) {
>> +                memcpy(tmp, &out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
>> +                       ATRAC3P_SUBBAND_SAMPLES * sizeof(*tmp));
>> +                memcpy(&out[0][sb * ATRAC3P_SUBBAND_SAMPLES],
>> +                       &out[1][sb * ATRAC3P_SUBBAND_SAMPLES],
>> +                       ATRAC3P_SUBBAND_SAMPLES *
>> +                       sizeof(out[0][sb * ATRAC3P_SUBBAND_SAMPLES]));
>> +                memcpy(&out[1][sb * ATRAC3P_SUBBAND_SAMPLES], tmp,
>> +                       ATRAC3P_SUBBAND_SAMPLES *
>> +                       sizeof(out[1][sb * ATRAC3P_SUBBAND_SAMPLES]));
>> +            }
>>
>> This is probably faster and simpler this way
>>
>> for (j=0; j < ATRAC3P_SUBBAND_SAMPLES; j++)
>>      FFSWAP(float, out[0][sb * ATRAC3P_SUBBAND_SAMPLES + j], out[1][sb *
>> ATRAC3P_SUBBAND_SAMPLES + j]);
>
>
> Yes, this is simpler indeed. Therefore I've changed the code accordingly.
>
> Regarding the speed I didn't notice any improvements. Such an improvement
would be a function in the float DSP that quickly swaps two memory vectors.
Unfortunately, there is no such a function but we could add smth like that:
>
> fdsp->vector_swap_vectors(float *v1, float *v2, int direction)
>
> The last parameter gives the direction which the 2nd pointer will be
estimated to: +1 - forwards, -1 = backwards. Having such a function will
help to speed up several other parts of the existing ATRAC decoders and
possible even more.
>
> I could elaborate a PowerPC Altivec implementation. I'm not good on x86
and ARM though...

These look like good ideas, but I think it is better to wait until the
C-only code is committed.

>
>
>> +    if (ctx->mute_flag)
>> +        for (ch = 0; ch < num_channels; ch++)
>> +            memset(out[ch], 0, ATRAC3P_FRAME_SAMPLES *
sizeof(*out[ch]));
>>
>> This can checked earlier (no need to calculate everything just for
zeroing
>> it later).
>
>
> Done.

Just to check: you do not skip decoding stuff whose memory will be used in
the next frame?

>
>> +static void reconstruct_frame(ATRAC3PContext *ctx, Atrac3pChanUnitCtx
>> *ch_unit,
>> +                              int num_channels, AVCodecContext *avctx)
>> +{
>> +    int ch, sb;
>> +
>> +    for (ch = 0; ch < num_channels; ch++) {
>> +        for (sb = 0; sb < ch_unit->num_subbands; sb++) {
>> +            /* inverse transform and windowing */
>> +            ff_atrac3p_imdct(&ctx->fdsp, &ctx->mdct_ctx,
>> +                             &ctx->samples[ch][sb *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +                             &ctx->mdct_buf[ch][sb *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +                             (ch_unit->channels[ch].wnd_shape_prev[sb]
<<
>> 1) +
>> +                             ch_unit->channels[ch].wnd_shape[sb], sb);
>> +
>> +            /* gain compensation and overlapping */
>> +            ff_atrac_gain_compensation(&ctx->gainc_ctx,
>> +                                       &ctx->mdct_buf[ch][sb *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +                                       &ch_unit->prev_buf[ch][sb *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +
>> &ch_unit->channels[ch].gain_data_prev[sb],
>> +
>> &ch_unit->channels[ch].gain_data[sb],
>> +                                       ATRAC3P_SUBBAND_SAMPLES,
>> +                                       &ctx->time_buf[ch][sb *
>> ATRAC3P_SUBBAND_SAMPLES]);
>> +        }
>> +
>> +        /* zero unused subbands in both output and overlapping buffers
*/
>> +        memset(&ch_unit->prev_buf[ch][ch_unit->num_subbands *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +               0,
>> +               (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
>> +               ATRAC3P_SUBBAND_SAMPLES *
>> +               sizeof(ch_unit->prev_buf[ch][ch_unit->num_subbands *
>> ATRAC3P_SUBBAND_SAMPLES]));
>> +        memset(&ctx->time_buf[ch][ch_unit->num_subbands *
>> ATRAC3P_SUBBAND_SAMPLES],
>> +               0,
>> +               (ATRAC3P_SUBBANDS - ch_unit->num_subbands) *
>> +               ATRAC3P_SUBBAND_SAMPLES *
>> +               sizeof(ctx->time_buf[ch][ch_unit->num_subbands *
>> ATRAC3P_SUBBAND_SAMPLES]));
>>
>> Can the memsets be avoided by simply ignoring the buffers' previous
values
>> in the function that actually fill them up?
>>
>> For example, in ff_atrac3p_generate_tones(), replacing
>>
>> +    /* Overlap and add to residual */
>> +    for (i = 0; i < 128; i++)
>> +        out[i] += wavreg1[i] + wavreg2[i];
>>
>> by
>>
>>      /* Overlap and add to residual */
>>      for (i = 0; i < 128; i++)
>>          out[i] = wavreg1[i] + wavreg2[i];
>>
>> It also saves you one sum operation.
>
>
> I like the idea but I'm afraid this won't work. Sine tones are usually
limited to the lower spectral bands (0...2 KHz). These bands tend to
contain non-zero data. Zeroing is usually done in the high spectral bands
with no sine waves...

Is it trivial to only memset what is needed?

>
>
>
>> +
>> +#define TWOPI (2 * M_PI)
>>
>> I'm not sure this define improves readability in any way.
>
>
> Perhaps not. I've seen this definition several times in other projects
related to synthesizers and therefore considered this to be a standard...
>
>
>
>> +    /* generate sine wave table */
>> +    for (i = 0; i < 2048; i++)
>> +        sine_table[i] = sin(TWOPI * i / 2048);
>>
>> ...
>>
>> +        inc = wave_param->freq_index;
>> +        pos = DEQUANT_PHASE(wave_param->phase_index) - (reg_offset ^
128)
>> * inc & 0x7FF;
>>
>> If you replace "& 0x7FF" by "& 2047", it will be more obvious why you are
>> doing it.
>
>
> Ok, replaced...
>
>> [...]
>>
>>
>>      /* Hann windowing for non-faded wave signals */
>>      if (tones_now->num_wavs && tones_next->num_wavs &&
>>          reg1_env_nonzero && reg2_env_nonzero) {
>>          fdsp->vector_fmul_window(out, wavreg1, wavreg2, hann_window,
128);
>>      } else if (!tones_now->num_wavs ||
(!tones_now->curr_env.has_stop_point
>> && !tones_next->curr_env.has_start_point)) {
>>          memset(out, 0, 256*sizeof(*out));
>>      } else if (tones_next->curr_env.has_stop_point){
>>          fdsp->vector_fmul_reverse(wavreg1, wavreg1, hann_window, 128);
>>      } else if (tones_now->curr_env.has_start_point){
>>          fdsp->vector_fmul(out, wavreg2, hann_window, 128);
>>      }
>>
>> One additional advantage of doing this way would be to have a half as
long
>> hann_window buffer (since hann_window[i] == hann_window[256-i]).
>
>
> Unofortunately, this hann window is abit asymmetric: hann_window[256] =
1.0
> This leads to hann_window[i] != hann_window[256-i].
>
> I had to write the reversed part as follows:
>
> fdsp->vector_fmul_reverse(wavreg1 + 1, wavreg1 + 1, hann_window, 128)
>
> but that will bomb out at least on x86 SSE due to unaligned memory
access...
>

Ok, I'll give this a second look but this can wait for when the code is
committed.

>
>> +/* First half of the 384-tap IPQF filtering coefficients. */
>> +static const float ipqf_coeffs1[ATRAC3P_PQF_FIR_LEN][16] = {
>> +    { -5.8336207e-7,    -8.0604229e-7,    -4.2005411e-7,
 -4.4400572e-8,
>> +       3.226247e-8,      3.530856e-8,      1.2660377e-8,
>> 0.000010516783,
>> +      -0.000011838618,   6.005389e-7,      0.0000014333754,
>> 0.0000023108685,
>> +       0.0000032569742,  0.0000046192422,  0.0000063894258,
>> 0.0000070302972 },
>>
>> ...
>>
>> +    { -0.0000062990207, -0.0000072701259, -0.000011984052,
>> -0.000017348082,
>> +      -0.000019907106,  -0.000021348773,  -0.000021961965,
>> -0.000012203576,
>> +      -0.000010840992,   4.6299544e-7,     5.2588763e-7,
2.7792686e-7,
>> +      -2.3649704e-7,    -0.0000010897784, -9.171448e-7,     -5.22682e-7
}
>> +};
>> +
>> +/* Second half of the 384-tap IPQF filtering coefficients. */
>> +static const float ipqf_coeffs2[ATRAC3P_PQF_FIR_LEN][16] = {
>> +    {  5.22682e-7,       9.171448e-7,      0.0000010897784,
 2.3649704e-7,
>> +      -2.7792686e-7,    -5.2588763e-7,    -4.6299544e-7,
>> 0.000010840992,
>> +      -0.000012203576,  -0.000021961965,  -0.000021348773,
>> -0.000019907106,
>> +      -0.000017348082,  -0.000011984052,  -0.0000072701259,
>> -0.0000062990207 },
>>
>> ...
>>
>> +    { -0.0000070302972, -0.0000063894258, -0.0000046192422,
>> -0.0000032569742,
>> +      -0.0000023108685, -0.0000014333754, -6.005389e-7,
>> 0.000011838618,
>> +       0.000010516783,   1.2660377e-8,     3.530856e-8,
 3.226247e-8,
>> +      -4.4400572e-8,    -4.2005411e-7,    -8.0604229e-7,
 -5.8336207e-7 }
>> +};
>>
>> Isn't one table just the other backwards?
>
>
> No, even sub-arrays have their signs flipped. IMHO, doing this in the
code would unnecessarily complicate and perhaps slow down the whole
function.
> Once again, if we want to improve speed we should consider a SIMD
implementation...

Ok, but then you should either generate the second table from the first in
a static array or leave it hardcoded but add a comment explaining how they
are related.

And no further comments from me.

-Vitor