[FFmpeg-devel] [PATCH] WMA Voice decoder
Vitor Sessak
vitor1001
Sat Feb 6 17:19:49 CET 2010
Ronald S. Bultje wrote:
> Hi,
>
> On Tue, Feb 2, 2010 at 11:35 AM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
>
>> (Work on aw_*() is still ongoing...)
>>
>
> I have something without the crazy loops and using av_log2(), maybe
> this is better?
>
> +/**
> + * Parse 10 independently-coded LSPs.
> + */
> +static void dequant_lsp10i(GetBitContext *gb, double *lsps)
> +{
> + static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
> + static const double mul_lsf[4] = {
> + 5.2187144800e-3, 1.4626986422e-3,
> + 9.6179549166e-4, 1.1325736225e-3
> + };
> + static const double base_lsf[4] = {
> + M_PI * -2.15522e-1, M_PI * -6.1646e-2,
> + M_PI * -3.3486e-2, M_PI * -5.7408e-2
> + };
>
I think just writing the precomputed constants (already multiplied by pi) would be more readable.
> +#define NO_OFFSET -255
> +/**
> + * Parse the offset of the first pitch-adaptive window pulses, and
> + * the distribution of pulses between the two blocks in this frame.
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param pitch pitch for each block in this frame
> + */
> +static void aw_parse_coords(AVCodecContext *ctx, GetBitContext *gb,
> + const int *pitch)
> +{
> + static const int16_t start_offset[94] = {
> + -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
> + 13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
> + 27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
> + 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
> + 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
> + 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
> + 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
> + 141, 143, 145, 147, 149, 151, 153, 155, 157, 159
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + int bits, n, offset, off_table[11], first_idx[2];
> +
> + s->aw_idx_is_ext = 0;
> + if ((bits = get_bits(gb, 6)) >= 54) {
> + s->aw_idx_is_ext = 1;
> + bits += (bits - 54) * 3 + get_bits(gb, 2);
> + }
> + s->aw_pitch_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
> +
> + offset = start_offset[bits];
> + for (n = 0; n < 11 && offset < MAX_FRAMESIZE; n++) {
> + off_table[n] = offset;
> + offset += pitch[offset >= MAX_FRAMESIZE / 2];
> + }
> + if (n < 11)
> + memset(&off_table[n], -1, (11 - n) * sizeof(int));
>
-1 or NO_OFFSET?
> +
> + s->aw_n_pulses[0] = s->aw_n_pulses[1] = 0;
>
> + s->aw_first_pulse_off[0] = s->aw_first_pulse_off[1] = NO_OFFSET;
>
Is this initialization really needed?
> +/**
> + * Parse FCB/ACB signal for a single block.
> + * @note see #synth_block().
> + */
> +static void synth_block_fcb_acb(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + int block_pitch_sh2,
> + const struct frame_type_desc *frame_desc,
> + float *excitation)
> +{
> + static const float gain_coeff[6] = {
> + 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
> + int n, idx, gain_weight;
> + AMRFixed fcb;
> +
> + assert(size <= MAX_FRAMESIZE / 2);
> + memset(pulses, 0, sizeof(*pulses) * size);
> +
> + fcb.pitch_lag = block_pitch_sh2 >> 2;
> + fcb.pitch_fac = 1.0;
> + fcb.no_repeat_mask = 0;
> + fcb.n = 0;
> +
> + /* For the other frame types, this is where we apply the innovation
> + * (fixed) codebook pulses of the speech signal. */
> + if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
> + aw_pulse_set1(s, gb, block_idx, &fcb);
> + aw_pulse_set2(s, gb, block_idx, &fcb);
> + } else /* FCB_TYPE_EXC_PULSES */ {
> + int offset_nbits = 5 - frame_desc->log_n_blocks;
> +
> + fcb.no_repeat_mask = -1;
> + for (n = 0; n < 5; n++) {
> + float pulse = get_bits1(gb) ? 1.0 : -1.0;
> + int idx1, idx2;
> +
> + idx1 = get_bits(gb, offset_nbits);
> + fcb.x[fcb.n] = n + 5 * idx1;
> + fcb.y[fcb.n++] = pulse;
> + if (n < frame_desc->dbl_pulses) {
> + idx2 = get_bits(gb, offset_nbits);
> + fcb.x[fcb.n] = n + 5 * idx2;
> + fcb.y[fcb.n++] = (idx1 >= idx2) ? pulse : -pulse;
> + }
> + }
> + }
>
The else{} case is very close to ff_decode_10_pulses_35bits().
> +/**
> + * Synthesize output samples for a single frame.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoder context
> + * @param gb bit I/O context (s->gb or one for cross-packet superframes)
> + * @param samples pointer to output sample buffer, has space for at least 160
> + * samples
> + * @param lsps LSP array
> + * @param prev_lsps array of previous frame's LSPs
> + * @param excitation target buffer for excitation signal
> + * @param synth target buffer for synthesized speech data
> + * @return 0 on success, <0 on error.
> + */
> +static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
> + float *samples,
> + const double *lsps, const double *prev_lsps,
> + float *excitation, float *synth)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
> + int pitch[MAX_BLOCKS], last_block_pitch;
> +
> + /* Parse frame type ("frame header"), see #frame_descs */
> + int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
> + block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
> +
> + if (bd_idx < 0) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Invalid frame type VLC code, skipping\n");
> + return -1;
> + }
> +
> + /*
> + * Pitch (per ACB type):
> + * - type 0: unused
> + * - type 1: provided (globally) for the whole frame. In #synth_block(),
> + * we derive the "pitch-per-sample" for adaptive codebook
> + * reading.
> + * - type 2: provided per block (see just before the call to
> + * #synth_block()), so not read here.
> + */
>
I think it would be more readable if you moved the comments inside the
corresponding "case ACB_XX:" statements.
> + switch (frame_descs[bd_idx].acb_type) {
> + case ACB_TYPE_NONE:
> + memset(pitch, 0, sizeof(pitch[0]) * frame_descs[bd_idx].n_blocks);
>
If it is unused, is it really necessary to waste time zeroing it out?
Besides that, I don't have any other comments.
-Vitor
More information about the ffmpeg-devel
mailing list