[FFmpeg-soc] [PATCH] AMR-WB Decoder

Mon Sep 6 16:31:18 CEST 2010

On 09/06/2010 02:46 AM, Marcelo Galvăo Póvoa wrote:
> Ok, fortunately I've found the bug!
>

[...]

> So, here is how the decoder is now (I'm sending my commits to github also).
>

> +/** Get x bits in the index interval [lsb,lsb+len-1] inclusive */
> +#define BIT_STR(x,lsb,len) (((x)>>  (lsb))&  ((1<<  (len)) - 1))
> +
> +/** Get the bit at specified position */
> +#define BIT_POS(x, p) (((x)>>  (p))&  1)
>

If you move the #defines closer to where it is used, you improve the
readability of your code.

> +
> +/**
> + * Decodes quantized ISF vectors using 36-bit indices (6K60 mode only)
> + *
> + * @param[in] ind                  Array of 5 indices
> + * @param[out] isf_q               Buffer for isf_q[LP_ORDER]
> + *
> + */
> +static void decode_isf_indices_36b(uint16_t *ind, float *isf_q) {
> +    int i;
> +
> +    for (i = 0; i<  9; i++) {
> +        isf_q[i] = dico1_isf[ind[0]][i] * (1.0f / (1<<  15));
> +    }
> +    for (i = 0; i<  7; i++) {
> +        isf_q[i + 9] = dico2_isf[ind[1]][i] * (1.0f / (1<<  15));
> +    }
> +    for (i = 0; i<  5; i++) {
> +        isf_q[i] += dico21_isf_36b[ind[2]][i] * (1.0f / (1<<  15));
> +    }
> +    for (i = 0; i<  4; i++) {
> +        isf_q[i + 5] += dico22_isf_36b[ind[3]][i] * (1.0f / (1<<  15));
> +    }
> +    for (i = 0; i<  7; i++) {
> +        isf_q[i + 9] += dico23_isf_36b[ind[4]][i] * (1.0f / (1<<  15));
> +    }
> +}
>

I think this would be more readable if the (1.0f / (1 << 15)) rescaling
would be done in isf_add_mean_and_past().

> +
> +/**
> + * Find the pitch vector by interpolating the past excitation at the
> + * pitch delay, which is obtained in this function
> + *
> + * @param[in,out] ctx              The context
> + * @param[in] amr_subframe         Current subframe data
> + * @param[in] subframe             Current subframe index (0 to 3)
> + */
> +static void decode_pitch_vector(AMRWBContext *ctx,
> +                                const AMRWBSubFrame *amr_subframe,
> +                                const int subframe)
> +{
> +    int pitch_lag_int, pitch_lag_frac;
> +    int i;
> +    float *exc     = ctx->excitation;
> +    enum Mode mode = ctx->fr_cur_mode;
> +
> +    if (mode<= MODE_8k85) {
> +        decode_pitch_lag_low(&pitch_lag_int,&pitch_lag_frac, amr_subframe->adap,
> +&ctx->base_pitch_lag, subframe, mode);
> +    } else
> +        decode_pitch_lag_high(&pitch_lag_int,&pitch_lag_frac, amr_subframe->adap,
> +&ctx->base_pitch_lag, subframe);
> +
> +    ctx->pitch_lag_int = pitch_lag_int;
>

> +    pitch_lag_int     += (pitch_lag_frac<  0 ? -1 : 0) + (pitch_lag_frac ? 1 : 0);
>

pitch_lag_int     += pitch_lag_frac ? FFSIGN(pitch_lag_frac) : 0;

> +/**
> + * Reduce fixed vector sparseness by smoothing with one of three IR filters
> + * Also known as "adaptive phase dispersion"
> + *
> + * @param[in] ctx                  The context
> + * @param[in,out] fixed_vector     Unfiltered fixed vector
> + * @param[out] buf                 Space for modified vector if necessary
> + *
> + * @return The potentially overwritten filtered fixed vector address
> + */
> +static float *anti_sparseness(AMRWBContext *ctx,
> +                              float *fixed_vector, float *buf)
> +{
> +    int ir_filter_nr;
> +
> +    if (ctx->fr_cur_mode>  MODE_8k85) // no filtering in higher modes
> +        return fixed_vector;
> +
> +    if (ctx->pitch_gain[0]<  0.6) {
> +        ir_filter_nr = 0;      // strong filtering
> +    } else if (ctx->pitch_gain[0]<  0.9) {
> +        ir_filter_nr = 1;      // medium filtering
> +    } else
> +        ir_filter_nr = 2;      // no filtering
> +
> +    /* detect 'onset' */
> +    if (ctx->fixed_gain[0]>  3.0 * ctx->fixed_gain[1]) {
> +        if (ir_filter_nr<  2)
> +            ir_filter_nr++;
> +    } else {
> +        int i, count = 0;
> +
> +        for (i = 0; i<  6; i++)
> +            if (ctx->pitch_gain[i]<  0.6)
> +                count++;
> +
> +        if (count>  2)
> +            ir_filter_nr = 0;
> +
> +        if (ir_filter_nr>  ctx->prev_ir_filter_nr + 1)
> +            ir_filter_nr--;
> +    }
> +
> +    /* update ir filter strength history */
> +    ctx->prev_ir_filter_nr = ir_filter_nr;
> +
> +    ir_filter_nr += (ctx->fr_cur_mode == MODE_8k85 ? 1 : 0);
> +
> +    if (ir_filter_nr<  2) {
> +        int i, j;
> +        const float *coef = ir_filters_lookup[ir_filter_nr];
> +
> +        /* Circular convolution code in the reference
> +         * decoder was modified to avoid using one
> +         * extra array. The filtered vector is given by:
> +         *
> +         * c2(n) = sum(i,0,len-1){ c(i) * coef( (n - i + len) % len ) }
> +         */
> +
> +        memset(buf, 0, sizeof(float) * AMRWB_SFR_SIZE);
> +        for (i = 0; i<  AMRWB_SFR_SIZE; i++)

> +            if (fixed_vector[i]) {
> +                int li = AMRWB_SFR_SIZE - i;
> +
> +                for (j = 0; j<  li; j++)
> +                    buf[i + j] += fixed_vector[i] * coef[j];
> +
> +                for (j = 0; j<  i; j++)
> +                    buf[j] += fixed_vector[i] * coef[j + li];
> +            }

Can celp_filters.c:ff_celp_circ_addf() simplify this?

> +/**
> + * Extrapolate a ISF vector to the 16kHz range (20th order LP)
> + * used at mode 6k60 LP filter for the high frequency band
> + *
> + * @param[out] out                 Buffer for extrapolated isf
> + * @param[in] isf                  Input isf vector
> + */
> +static void extrapolate_isf(float out[LP_ORDER_16k], float isf[LP_ORDER])
> +{
> +    float diff_isf[LP_ORDER - 2], diff_mean;
> +    float *diff_hi = diff_isf - LP_ORDER + 1; // diff array for extrapolated indices
> +    float corr_lag[3];
> +    float est, scale;
> +    int i, i_max_corr;
> +
> +    memcpy(out, isf, (LP_ORDER - 1) * sizeof(float));
> +    out[LP_ORDER_16k - 1] = isf[LP_ORDER - 1];
> +
> +    /* Calculate the difference vector */
> +    for (i = 0; i<  LP_ORDER - 2; i++)
> +        diff_isf[i] = isf[i + 1] - isf[i];
> +
> +    diff_mean = 0.0;
> +    for (i = 2; i<  LP_ORDER - 2; i++)
> +        diff_mean += diff_isf[i] / (LP_ORDER - 4);

diff_mean += diff_isf[i] * (1.0f / (LP_ORDER - 4));

> +    for (i = LP_ORDER - 1; i<  LP_ORDER_16k - 1; i++)
> +        diff_hi[i] = scale * (out[i] - out[i - 1]);
> +
> +    /* Stability insurance */
> +    for (i = LP_ORDER; i<  LP_ORDER_16k - 1; i++)
> +        if (diff_hi[i] + diff_hi[i - 1]<  5.0) {
> +            if (diff_hi[i]>  diff_hi[i - 1]) {
> +                diff_hi[i - 1] = 5.0 - diff_hi[i];
> +            } else
> +                diff_hi[i] = 5.0 - diff_hi[i - 1];
> +        }
> +
> +    for (i = LP_ORDER - 1; i<  LP_ORDER_16k - 1; i++)
> +        out[i] = out[i - 1] + diff_hi[i] * (1.0f / (1<<  15));

Hmm, this hunk looks a lot like

ff_sort_nearly_sorted_floats(out, LP_ORDER_16k);
ff_set_min_dist_lsf(out, 5.0f / (1<<  15));
for (i = LP_ORDER - 1; i<  LP_ORDER_16k - 1; i++)
     out[i] = out[i - 1] + scale * (1.0f / (1<<  15)) * (out[i] - out[i-1]);

or something like that.

> +/**
> + * Apply to high-band samples a 15th order filter
> + * The filter characteristic depends on the given coefficients
> + *
> + * @param[out] out                 Buffer for filtered output
> + * @param[in] fir_coef             Filter coefficients
> + * @param[in,out] mem              State from last filtering (updated)
> + * @param[in] cp_gain              Compensation gain (usually the filter gain)
> + * @param[in] in                   Input speech data (high-band)
> + *
> + * @remark It is safe to pass the same array in in and out parameters
> + */
> +static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
> +                          float mem[HB_FIR_SIZE], float cp_gain, const float *in)
> +{
> +    int i, j;
> +    float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
> +
> +    memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
> +
> +    for (i = 0; i<  AMRWB_SFR_SIZE_16k; i++)
> +        data[i + HB_FIR_SIZE] = in[i] / cp_gain;
> +
> +    for (i = 0; i<  AMRWB_SFR_SIZE_16k; i++) {
> +        out[i] = 0.0;
> +        for (j = 0; j<= HB_FIR_SIZE; j++)
> +            out[i] += data[i + j] * fir_coef[j];
> +    }
> +
> +    memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
> +}

This function does too much memcpy: 30+30+80 elements. I suggest that
you use a buffer HB_FIR_SIZE elements larger where is is called, so
that the memcpy(data, mem, etc) is not needed.

Another thing is that the scaling by cp_gain is not needed. You can
replace it by multiplying the bpf_6_7_coef table by 4.

Finally, I would prefer if the function contain just the inner loop
(leaving the memory updating for the caller). This way it is simpler
to reuse it in other codecs in the future. Since it is also a very
good target for optimization (AMRWB spend>  30% of the decoding time
on it), it is nice to have a simple interface the day someone decides
to optimize it.

> +static int amrwb_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
> +                              AVPacket *avpkt)
> +{
> +    AMRWBContext *ctx  = avctx->priv_data;
> +    AMRWBFrame   *cf   =&ctx->frame;
> +    const uint8_t *buf = avpkt->data;
> +    int buf_size       = avpkt->size;
> +    float *buf_out = data;
> +    float spare_vector[AMRWB_SFR_SIZE];      // extra stack space to hold result from anti-sparseness processing
> +    float fixed_gain_factor;                 // fixed gain correction factor (gamma)
> +    float *synth_fixed_vector;               // pointer to the fixed vector that synthesis should use
> +    float synth_fixed_gain;                  // the fixed gain that synthesis should use
> +    float voice_fac, stab_fac;               // parameters used for gain smoothing
> +    float synth_exc[AMRWB_SFR_SIZE];         // post-processed excitation for synthesis
> +    float hb_exc[AMRWB_SFR_SIZE_16k];        // excitation for the high frequency band
> +    float hb_samples[AMRWB_SFR_SIZE_16k];    // filtered high-band samples from synthesis
> +    float hb_gain;
> +    int sub, i;
> +
> +    ctx->fr_cur_mode = unpack_bitstream(ctx, buf, buf_size);

Missing a call like

     if (buf_size<  expected_input_size) {
         av_log(avctx, AV_LOG_ERROR,
                "Frame too small (%d bytes). Truncated file?\n", buf_size);
         *data_size = 0;
         return buf_size;
     }

To avoid segfaults for truncated files.

> +        /* Add the low and high frequency bands */
> +        for (i = 0; i<  AMRWB_SFR_SIZE_16k; i++) {
> +            // XXX: the low-band should really be upscaled by 2.0? This
> +            // way the output volume level seems doubled
> +            sub_buf[i] = (sub_buf[i] + hb_samples[i]) / 32768.0;

* (1.0f / (1<<  15));

-Vitor