[FFmpeg-devel] [PATCH] WMA Voice decoder

Wed Jan 20 18:32:40 CET 2010

"Ronald S. Bultje" <rsbultje at gmail.com> writes:

> Hi,
>
> my first decoder, please be kind. :-).
>
> Ronald
>
> Index: ffmpeg-svn/libavcodec/Makefile
> ===================================================================
> --- ffmpeg-svn.orig/libavcodec/Makefile	2010-01-11 12:01:17.000000000 -0500
> +++ ffmpeg-svn/libavcodec/Makefile	2010-01-19 19:30:17.000000000 -0500
> @@ -330,6 +330,7 @@
>  OBJS-$(CONFIG_WMAV1_ENCODER)           += wmaenc.o wma.o
>  OBJS-$(CONFIG_WMAV2_DECODER)           += wmadec.o wma.o
>  OBJS-$(CONFIG_WMAV2_ENCODER)           += wmaenc.o wma.o
> +OBJS-$(CONFIG_WMAVOICE_DECODER)        += wmavoice.o
>  OBJS-$(CONFIG_WMV2_DECODER)            += wmv2dec.o wmv2.o        \
>                                            msmpeg4.o msmpeg4data.o \
>                                            intrax8.o intrax8dsp.o
> Index: ffmpeg-svn/libavcodec/allcodecs.c
> ===================================================================
> --- ffmpeg-svn.orig/libavcodec/allcodecs.c	2010-01-11 12:01:16.000000000 -0500
> +++ ffmpeg-svn/libavcodec/allcodecs.c	2010-01-11 12:03:28.000000000 -0500
> @@ -246,6 +246,7 @@
>      REGISTER_DECODER (WMAPRO, wmapro);
>      REGISTER_ENCDEC  (WMAV1, wmav1);
>      REGISTER_ENCDEC  (WMAV2, wmav2);
> +    REGISTER_DECODER (WMAVOICE, wmavoice);
>      REGISTER_DECODER (WS_SND1, ws_snd1);
>
>      /* PCM codecs */

These bits are OK.

> +/**
> + * Description of frame types.
> + */
> +static const struct frame_type_desc {
> +    short   n_blocks,     ///< amount of blocks per frame
> +                          ///< (each block contains 160/#n_blocks samples)
> +            acb_type,     ///< Adaptive codebook type in frame/block:
> +                          ///< - 0: fixed codebook with per-block/frame gain,
> +                          ///< - 1: adaptive codebook with per-frame pitch,
> +                          ///< - 2: adaptive codebook with per-block pitch
> +            fcb_type,     ///< Fixed codebook type in frame/block:
> +                          ///< -   0: hardcoded codebook, per-frame gain,
> +                          ///< -   1: hardcoded codebook, per-block gain,
> +                          ///< -   2: pitch-adaptive window (AW) pulse signal,
> +                          ///< - 4-6: innovation (fixed) codebook pulses
> +            dbl_pulses,   ///< how many pulse vectors have pulse pairs
> +                          ///< (rather than just one single pulse)
> +                          ///< only if #fcb_type >= 4 && <= 6
> +            frame_size;   ///< the amount of bits that make up the block
> +                          ///< data (per frame)
> +} frame_descs[17] = {
> +    { 1, 0, 0, 0,   0 },
> +    { 2, 0, 1, 0,  28 },
> +    { 2, 1, 2, 0,  46 },
> +    { 2, 1, 4, 2,  80 },
> +    { 2, 1, 4, 5, 104 },
> +    { 4, 1, 5, 0, 108 },
> +    { 4, 1, 5, 2, 132 },
> +    { 4, 1, 5, 5, 168 },
> +    { 2, 2, 4, 0,  64 },
> +    { 2, 2, 4, 2,  80 },
> +    { 2, 2, 4, 5, 104 },
> +    { 4, 2, 5, 0, 108 },
> +    { 4, 2, 5, 2, 132 },
> +    { 4, 2, 5, 5, 168 },
> +    { 8, 2, 6, 0, 176 },
> +    { 8, 2, 6, 2, 208 },
> +    { 8, 2, 6, 5, 256 }
> +};

I suggest splitting the struct declaration from the frame_descs[]
definition.  What you have there looks a little odd to me, though it
will of course work correctly.

[...]

> +/**
> + * Sets up the variable bit mode (VBM) tree from container extradata.
> + * @param s  WMA Voice decoding context.
> + *           The bit context (s->gb) should be loaded with byte 23-46 of the
> + *           container extradata (i.e. the ones containing the VBM tree).
> + * @return 0 on success, <0 on error.
> + */
> +static int
> +decode_vbmtree(WMAVoiceContext *s)
> +{
> +    GetBitContext *gb = &s->gb;
> +    unsigned int cntr[8], n, res;
> +
> +    memset(s->vbm_tree, (uint8_t) -1, sizeof(s->vbm_tree));

Useless cast.

> +    memset(cntr, 0, sizeof(cntr));
> +    for (n = 0; n < 17; n++) {
> +        res = get_bits(gb, 3);
> +        if (cntr[res] >= 3 + (res == 7))
> +            return -1;
> +        s->vbm_tree[res * 3 + cntr[res]++] = n;
> +    }
> +
> +    return 0;
> +}

If this, as the comment suggests, is used only during setup, make it
av_cold.

> +/**
> + * Initialize decoder.
> + */
> +static int
> +wmavoice_decode_init(AVCodecContext *ctx)

av_cold

[...]

> +/**
> + * Read an integer coded as a variable-bit number.
> + * @param gb bit I/O context
> + */
> +static int
> +get_vbm_bits(GetBitContext *gb)
> +{
> +    int n, res;
> +
> +    for (n = 0; ; n++) {
> +        res = get_bits(gb, 2);
> +        if (res < 3 || n == 6 /** don't increase n to 7 */)
> +            break;
> +    }
> +
> +    return 3 * n + res;
> +}

Is this called a lot?  If yes, it can be optimised.

> +/**
> + * Dequantize LSPs
> + * @param lsps pointer to an array of LSPs, holding at least @num values
> + * @param num number of LSPs to be dequantized
> + * @param values quantized values, contains @n_stages values
> + * @param sizes range (well, max. value) of each quantized value in @values
> + * @param n_stages number of dequantization runs
> + * @param table dequantization table to be used
> + * @param mul_q LSF multiplier
> + * @param base_q base (lowest) LSF values
> + */
> +static void
> +dequant_lsps(double *lsps, int num,
> +             const uint16_t *values, const uint16_t *sizes, int n_stages,
> +             const uint8_t *table, const double *mul_q, const double *base_q)
> +{
> +    int n, m;
> +
> +    for (n = 0; n < num; n++) lsps[n] = 0.0;
> +    for (n = 0; n < n_stages; table += sizes[n++] * num) {
> +        for (m = 0; m < num; m++)
> +            lsps[m] += base_q[n] + mul_q[n] * table[m + values[n] * num];
> +    }
> +}

Does this really need double precision?  Certainly the inputs could be
single-precision even if the output needs more.  Judging by the input
values it is called with, single precision should be enough.

Another thing worth trying is storing the tables as the top 16 bits of
floats directly (the low 16 bits are all zeros for values up to 256).
While this doubles the table size, it avoids the int to float
conversion step.

This might use a bit of simd too.

Is there any pattern in those tables that could be exploited?

> +/**
> + * @defgroup lsp_dequant LSP dequantization routines
> + * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
> + * @note we assume enough bits are available, caller should check.
> + * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
> + * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
> + * @{
> + */
> +/**
> + * Parse 10 independently-coded LSPs.
> + */
> +static void
> +dequant_lsp10i(GetBitContext *gb, double *lsps)
> +{
> +    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
> +    static const double mul_lsf[4] = {
> +        5.2187144800e-3,    1.4626986422e-3,
> +        9.6179549166e-4,    1.1325736225e-3
> +    }, base_lsf[4] = {
> +        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
> +        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
> +    };

Please split that into two separate declarations.  That just looks
weird.

> +    uint16_t v[4];
> +
> +    v[0] = get_bits(gb, 8);
> +    v[1] = get_bits(gb, 6);
> +    v[2] = get_bits(gb, 5);
> +    v[3] = get_bits(gb, 5);
> +
> +    dequant_lsps(lsps, 10, v, vec_sizes, 4, ff_wmavoice_dq_lsp10i,
> +                 mul_lsf, base_lsf);
> +}

[...]

> +/**
> + * Apply second set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param pitch pitch for this block
> + * @param out target vector to apply pulses to
> + * @param size size of @out vector
> + */
> +static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
> +                          int block_idx, int pitch, float *out, int size)
> +{
> +    short arr2[32];
> +    unsigned int arr1[3];
> +    int pulse_off = s->aw_first_pulse_off[block_idx],
> +        pulse_start, n, m, idx, range;
> +    float v;
> +
> +    assert(size == MAX_FRAMESIZE / 2);
> +    if (pulse_off != NO_OFFSET)
> +        while (pulse_off + s->aw_pitch_range < 1) pulse_off += pitch;
> +
> +    if (s->aw_n_pulses[0]) {
> +        if (block_idx == 0) {
> +            range = 32;
> +        } else { ///< block_idx == 1
> +            range = 8;
> +            if (pulse_off != NO_OFFSET) pulse_off = s->aw_next_pulse_off_cache;
> +        }
> +    } else {
> +        range = 16;
> +    }
> +    pulse_start = pulse_off != NO_OFFSET ? pulse_off - range / 2 : 0;
> +
> +    arr1[0] = arr1[1] = arr1[2] = -1;
> +#define BIT_IS_SET(idx) arr1[idx >> 5] & (1 << (idx & 31))
> +#define UNSET_BIT(idx)  arr1[idx >> 5] &= ~(1 << (idx & 31))

It's good practice to undef such macros at the end of the function.

> +    memset(arr2, 0, sizeof(arr2));
> +
> +    if (pulse_off != NO_OFFSET) for (n = 0; n < 11; n++) {
> +        m = pulse_off + n * pitch;
> +        for (idx = m; idx < m + s->aw_pitch_range; idx++)
> +            if (idx >= 0 && idx < size) UNSET_BIT(idx);
> +    }
> +
> +    for (n = 0, m = 0; m < 500 && n < range; pulse_start++, m++) {
> +        for (idx = pulse_start; idx < 0; idx += pitch);
> +        if (idx >= size) {
> +            for (idx = 0; idx < size; idx++)
> +                if (BIT_IS_SET(idx)) break;
> +            if (idx >= size) continue;
> +        }
> +        if (BIT_IS_SET(idx)) {
> +            arr2[n++] = idx;
> +            UNSET_BIT(idx);
> +        }
> +    }
> +
> +    idx = get_bits(gb, s->aw_n_pulses[0] ? 5 - 2 * block_idx : 4);
> +    v   = get_bits1(gb) ? -1.0 : 1.0;
> +    for (n = arr2[idx]; n < size; n += pitch)
> +        out[n] += v;
> +
> +    s->aw_next_pulse_off_cache = n - size; ///< relative to start of block
> +}

[...]

> +/**
> + * @}
> + *
> + * Generate a random number that is different for each frame/block
> + * and can be used as an index for a table with 1000 entries, if
> + * we want to read @block_size entries following.
> + *
> + * @param frame_cntr current frame number
> + * @param block_num current block index
> + * @param block_size amount of entries we want to read from a table
> + *                   that has 1000 entries
> + * @returns a unique random number for each @block_cntr/@block_num
> + *          combination, which can be used as index in a table that
> + *          has a 1000 entries from which we want to read @block_size
> + *          entries.
> + */
> +
> +static int
> +pRNG(int frame_cntr, int block_num, int block_size)
> +{
> +    int x = (block_num * 1877 + frame_cntr) % 0xFFFF;
> +    int y = (x % 9) * 5 + 6;
> +    int z = (uint16_t) (x * 49995 / y);
> +    return z % (1000 - block_size);
> +}
> +
> +/**
> + * Parse hardcoded signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_hardcoded(AVCodecContext *ctx, GetBitContext *gb,
> +                                 int block_idx, int size,
> +                                 const struct frame_type_desc *frame_desc,
> +                                 float *excitation)
> +{
> +    WMAVoiceContext *s = ctx->priv_data;
> +    float gain;
> +    int n, r_idx;
> +
> +    assert(size <= MAX_FRAMESIZE);
> +
> +    /**
> +     * For acb_type==0, this is where we set up the index to start reading
> +     * from @std_codebook from.
> +     */
> +    switch (frame_desc->fcb_type) {
> +        case 0: { ///< silence
> +            r_idx = pRNG(s->frame_cntr, block_idx, size);
> +            gain  = s->silence_gain;
> +            break;
> +        }
> +
> +        case 1: { ///< explicit (hardcoded) codebook signal
> +            r_idx = get_bits(gb, 8);
> +            gain  = ff_wmavoice_gain_universal[get_bits(gb, 6)];
> +            break;
> +        }
> +    }
> +
> +    /**
> +     * Clear gain prediction parameters.
> +     */
> +    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
> +
> +    /**
> +     * Apply gain to hardcoded codebook and use that as excitation signal.
> +     */
> +    for (n = 0; n < size; n++)
> +        excitation[n] = ff_wmavoice_std_codebook[r_idx + n] * gain;

We should have some simd for this loop.  If nothing suitable exists
already, something can be added.

> +}
> +
> +/**
> + * Parse FCB/ACB signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_fcb_acb(AVCodecContext *ctx, GetBitContext *gb,
> +                                int block_idx, int size,
> +                                int block_pitch_sh2,
> +                                const struct frame_type_desc *frame_desc,
> +                                float *excitation)
> +{
> +    static const float gain_coeff[6] = {
> +        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
> +    };
> +    WMAVoiceContext *s = ctx->priv_data;
> +    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
> +    int n, idx;
> +
> +    assert(size <= MAX_FRAMESIZE / 2);
> +
> +    /**
> +     * For the other frame types, this is where we apply the innovation
> +     * (fixed) codebook pulses of the speech signal.
> +     */
> +    switch (frame_desc->fcb_type) {
> +        case 2: ///< pitch-adapting window (AW) fixed codebook
> +            aw_pulse_set1(s, gb, block_idx, block_pitch_sh2 >> 2,
> +                          pulses, size);
> +
> +            aw_pulse_set2(s, gb, block_idx, block_pitch_sh2 >> 2,
> +                          pulses, size);
> +            break;
> +
> +        case 4: case 5: case 6: { ///< innovation (fixed) codebook
> +            int sh = 8 - frame_desc->fcb_type; ///< 4:4/5:3/6:2
> +
> +            memset(pulses, 0, sizeof(float) * size);
> +            for (n = 0; n < 5; n++) {
> +                float pulse = get_bits1(gb) ? 1.0 : -1.0;
> +                int idx1, idx2;
> +
> +                idx1 = get_bits(gb, sh);
> +                pulses[n + 5 * idx1] += pulse;
> +                if (n < frame_desc->dbl_pulses) {
> +                    idx2 = get_bits(gb, sh);
> +                    pulses[n + 5 * idx2] +=
> +                        (idx1 >= idx2) ? pulse : -pulse;
> +                }
> +            }

This can be optimised if it matters.

> +            break;
> +        }
> +    }
> +
> +    /**
> +     * Calculate gain for adaptive & fixed codebook signal.
> +     * @note see ff_amr_set_fixed_gain().
> +     */
> +    idx = get_bits(gb, 7);
> +    fcb_gain = ff_wmavoice_gain_codebook_fcb[idx] *
> +               expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
> +                    5.2409161640);
> +    acb_gain = ff_wmavoice_gain_codebook_acb[idx];
> +    pred_err = logf(av_clipf(ff_wmavoice_gain_codebook_fcb[idx], 0.05, 5.0));
> +
> +    if (frame_desc->n_blocks > 1)
> +        memmove(&s->gain_pred_err[8 / frame_desc->n_blocks],
> +                s->gain_pred_err,
> +                sizeof(float) * (6 - 8 / frame_desc->n_blocks));
> +    for (n = 0; n < 8 / frame_desc->n_blocks; n++)
> +        s->gain_pred_err[n] = pred_err;
> +
> +    /**
> +     * Calculation of adaptive codebook.
> +     */
> +    switch (frame_desc->acb_type) {
> +        case 1: { ///< adaptive codebook
> +            for (n = 0; n < size; n++) {
> +                float v;
> +                int pitch_sh8 = (s->last_pitch_val << 8) +
> +                    ((s->pitch_diff_sh16 * (block_idx * size + n)) >> 8);
> +                int pitch = (pitch_sh8 + 0x80) >> 8,
> +                    idx = (((pitch << 8) - pitch_sh8) * 8 + 0x480) >> 8, m;
> +                pitch -= idx >> 3;
> +                idx &= 7;
> +                for (v = 0., m = 16; m >= 0; m--) {
> +                    v += ff_wmavoice_ipol1_coeffs[idx][m] *
> +                         excitation[n + m - pitch - 8];
> +                }
> +                excitation[n] = v;
> +            }
> +            break;
> +        }
> +        case 2: { ///< adaptive codebook
> +            int block_pitch = block_pitch_sh2 >> 2, idx = block_pitch_sh2 & 3;
> +            if (idx--) {
> +                for (n = 0; n < size; n++) {
> +                    float v;
> +                    int m;
> +
> +                    for (v = 0., m = 0; m < 16; m++) {
> +                        v += ff_wmavoice_ipol2_coeffs[idx][m] *
> +                             excitation[m + n - 8 - block_pitch];
> +                    }
> +                    excitation[n] = v;
> +                }
> +            } else {
> +                for (n = 0; n < size; n++)
> +                    excitation[n] = excitation[n - block_pitch];

What are possible values for block_pitch?

> +            }
> +            break;
> +        }
> +    }
> +
> +    /**
> +     * Interpolate ACB/FCB and use as excitation signal.
> +     */
> +    ff_weighted_vector_sumf(excitation, excitation, pulses,
> +                            acb_gain, fcb_gain, size);

This looks like a simd candidate.  A lot of those celp functions do.

> +}
> +
> +/**
> + * Parse data in a single block.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param block_idx index of the to-be-read block
> + * @param size amount of samples to be read in this block
> + * @param block_pitch_sh2 pitch for this block << 2
> + * @param lsps LSPs for (the end of) this frame
> + * @param pre_lsps LSPs for the last frame
> + * @param frame_desc frame type descriptor
> + * @param excitation target memory for the ACB+FCB interpolated signal
> + * @param synth target memory for the speech synthesis filter output
> + * @return 0 on success, <0 on error.
> + */
> +
> +static void synth_block(AVCodecContext *ctx, GetBitContext *gb,
> +                        int block_idx, int size,
> +                        int block_pitch_sh2,
> +                        const double *lsps, const double *prev_lsps,
> +                        const struct frame_type_desc *frame_desc,
> +                        float *excitation, float *synth)
> +{
> +    WMAVoiceContext *s = ctx->priv_data;
> +    double i_lsps[MAX_LSPS];
> +    float lpcs[MAX_LSPS];
> +    float fac;
> +    int n;
> +
> +    if (frame_desc->acb_type == 0)
> +        synth_block_hardcoded(ctx, gb, block_idx, size, frame_desc, excitation);
> +    else
> +        synth_block_fcb_acb(ctx, gb, block_idx, size, block_pitch_sh2,
> +                            frame_desc, excitation);
> +
> +    /** convert interpolated LSPs to LPCs */
> +    fac = (block_idx + 0.5) / frame_desc->n_blocks;
> +    for (n = 0; n < s->lsps; n++) ///< LSF -> LSP
> +        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));

Calling trig functions is generally bad.  Perhaps it can't be avoided
here though.

> +    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
> +
> +    /**
> +     * Speech synthesis.
> +     */
> +    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
> +}
> +
> +/**
> + * Synthesize output samples for a single frame.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoder context
> + * @param gb bit I/O context (s->gb or one for cross-packet superframes)
> + * @param data pointer to output sample buffer, has space for at least 160
> + *             samples
> + * @param lsps LSP array
> + * @param prev_lsps array of previous frame's LSPs
> + * @return 0 on success, <0 on error.
> + */
> +static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
> +                       float *samples,
> +                       const double *lsps, const double *prev_lsps,
> +                       float *excitation, float *synth)
> +{
> +    WMAVoiceContext *s = ctx->priv_data;
> +    int n;
> +    short pitch[MAX_BLOCKS], last_block_pitch;
> +
> +    /**
> +     * Parse frame type ("frame header"), see #frame_descs.
> +     */
> +    int bd_idx = s->vbm_tree[get_vbm_bits(gb)],
> +        block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
> +
> +    /**
> +     * Pitch (per frame type):
> +     * - type 0: unused
> +     * - type 1: provided (globally) for the whole frame. In #synth_block(),
> +     *            we derive the "pitch-per-sample" for adaptive codebook
> +     *            reading.
> +     * - type 2: provided per block (see just before the call to
> +     *            #parse_block()), so not read here.
> +     */
> +    switch (frame_descs[bd_idx].acb_type) {
> +        case 0:
> +            for (n = 0; n < frame_descs[bd_idx].n_blocks; n++)
> +                pitch[n] = 0;
> +            break;

memset?

> +        case 1: {
> +            int n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
> +
> +            s->cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
> +            if (s->last_acb_type == 0 ||
> +                20 * abs(s->cur_pitch_val - s->last_pitch_val) >
> +                    (s->cur_pitch_val + s->last_pitch_val))
> +                s->last_pitch_val = s->cur_pitch_val;
> +
> +            /** pitch per frame/block */
> +            for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> +                int fac = ((n << 1) | 1);
> +
> +                pitch[n] = (fac                 * s->cur_pitch_val +
> +                            (n_blocks_x2 - fac) * s->last_pitch_val +
> +                            frame_descs[bd_idx].n_blocks) / n_blocks_x2;

These could use MUL16/MAC16.

> +            }
> +
> +            /** pitch per sample */
> +            s->pitch_diff_sh16 =
> +                ((s->cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
> +            break;
> +        }
> +    }
> +
> +    /**
> +     * Global gain (if silence) and pitch-adaptive window coordinates.
> +     */
> +    switch (frame_descs[bd_idx].fcb_type) {
> +        case 0:
> +            s->silence_gain = ff_wmavoice_gain_silence[get_bits(gb, 8)];
> +            break;
> +        case 2:
> +            aw_parse_coords(ctx, gb, pitch);
> +            break;
> +    }
> +
> +    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> +        int bl_pitch_sh2 = pitch[n] << 2;
> +
> +        /**
> +         * If pitch is given per block, parse that first. Per-block pitches
> +         * are encoded as an absolute value for the first block, and then
> +         * delta values for all subsequent blocks. Unit is different (also
> +         * scale is different), so we do some stupidly complex conversion.
> +         */
> +        if (frame_descs[bd_idx].acb_type == 2) {
> +            int block_pitch,
> +                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
> +                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
> +                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
> +
> +            if (n == 0) {
> +                block_pitch = get_bits(gb, s->block_pitch_nbits);
> +            } else
> +                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
> +                                 get_bits(gb, s->block_delta_pitch_nbits);
> +            /**
> +             * Convert last_ so that any next delta leads to a value within
> +             * _range.
> +             */
> +            last_block_pitch = av_clip(block_pitch,
> +                                       s->block_delta_pitch_hrange,
> +                                       s->block_pitch_range -
> +                                           s->block_delta_pitch_hrange);
> +
> +            /**
> +             * Convert semi-log-style scale back to normal scale.
> +             */
> +            if (block_pitch < t1) {
> +                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
> +            } else {
> +                block_pitch -= t1;
> +                if (block_pitch < t2) {
> +                    bl_pitch_sh2 =
> +                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
> +                } else {
> +                    block_pitch -= t2;
> +                    if (block_pitch < t3) {
> +                        bl_pitch_sh2 =
> +                            (s->block_conv_table[2] + block_pitch) << 2;
> +                    } else {
> +                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
> +                    }
> +                }
> +            }
> +            pitch[n] = bl_pitch_sh2 >> 2;
> +        }
> +
> +        synth_block(ctx, gb, n, block_nsamples, bl_pitch_sh2,
> +                    lsps, prev_lsps, &frame_descs[bd_idx],
> +                    &excitation[n * block_nsamples],
> +                    &synth[n * block_nsamples]);
> +    }

Does gcc split this loop in two, one for acb_type==2 and one for !=2?
If not, consider doing it manually.  That if() block is big enough that
it probably makes a difference.

> +    /**
> +     * Averaging projection filter, if applicable. Else, just copy samples
> +     * from synthesis buffer.
> +     */
> +    if (s->do_apf) {
> +        // FIXME this is where APF would take place, currently not implemented
> +        av_log_missing_feature(ctx, "APF", 0);
> +        s->do_apf = 0;
> +    } //else
> +        for (n = 0; n < 160; n++)
> +            samples[n] = av_clipf(synth[n], -1.0, 1.0);
> +
> +    /**
> +     * Cache values for next frame.
> +     */
> +    s->frame_cntr      = (s->frame_cntr + 1) % 0xFFFF;
> +    s->last_acb_type = frame_descs[bd_idx].acb_type;
> +    switch (frame_descs[bd_idx].acb_type) {
> +        case 0: s->last_pitch_val = 0;                break;
> +        case 1: s->last_pitch_val = s->cur_pitch_val; break;
> +        case 2: s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
> +                break;
> +    }
> +
> +    return 0;
> +}

[...]

> +/**
> + * Copy (unaligned) bits from gb/data/size to pb.
> + *
> + * @param bp target buffer to copy bits into
> + * @param data source buffer to copy bits from
> + * @param size size of @data
> + * @param gb bit I/O context specifying the current position in @data.
> + *           This function might use this to align the bit position to
> + *           a whole-byte boundary before calling @ff_copy_bits() on
> + *           aligned source data
> + * @param nbits the amount of bits to copy from @gb/@data to @pb
> + */
> +static void copy_bits(PutBitContext *pb,
> +                      const uint8_t *data, int size,
> +                      GetBitContext *gb, int nbits)
> +{
> +    int rmn_bytes, rmn_bits;
> +
> +    rmn_bits = rmn_bytes = get_bits_left(gb);
> +    if (rmn_bits < nbits)
> +        return;
> +    rmn_bits &= 7; rmn_bytes >>= 3;
> +    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
> +        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
> +    ff_copy_bits(pb, data + size - rmn_bytes,
> +                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
> +}

This should probably go somewhere more generic.

> +/**
> + * Packet decoding: a packet is anything that the (ASF) demuxer contains,
> + * and we expect that the demuxer / application provides it to us as such
> + * (else you'll probably get garbage as output). Every packet has a size of
> + * s->block_align bytes, starts with a packet header (see
> + * #parse_packet_header()), and then a series of superframes. Superframe
> + * boundaries may exceed packets, i.e. superframes can split data over
> + * multiple (two) packets.
> + *
> + * For more information about frames, see #synth_superframe().
> + */
> +static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
> +                                  int *data_size, AVPacket *avpkt)
> +{
> +    WMAVoiceContext *s = ctx->priv_data;
> +    GetBitContext *gb = &s->gb;
> +    int size, res, pos;
> +
> +    if (*data_size < 480 * sizeof(float)) {
> +        av_log(ctx, AV_LOG_ERROR,
> +               "Output buffer too small (%d given - %lu needed)\n",
> +               *data_size, 480 * sizeof(float));
> +        return -1;
> +    }
> +    *data_size = 0;
> +
> +    /**
> +     * Packets are sometimes a multiple of s->block_align, with a packet
> +     * header at each s->block_align bytes. However, FFmpeg's ASF demuxer
> +     * feeds us ASF packets, which may concatenate multiple "codec" packets
> +     * in a single "muxer" packet, so we artificially emulate that by
> +     * capping the packet size at s->block_align.
> +     */
> +    for (size = avpkt->size; size > s->block_align; size -= s->block_align);
> +    if (!size)
> +        return 0;
> +    init_get_bits(&s->gb, avpkt->data, size << 3);
> +
> +    /**
> +     * size == s->block_align is used to indicate whether we are dealing with
> +     * a new packet or a packet of which we already read the packet header
> +     * previously.
> +     */
> +    if (size == s->block_align) { ///< new packet header
> +        if ((res = parse_packet_header(s)) < 0)
> +            return res;
> +
> +        /**
> +         * If the packet header specifies a s->spillover_nbits, then we want
> +         * to push out all data of the previous packet (+ spillover) before
> +         * continuing to parse new superframes in the current packet.
> +         */
> +        if (s->spillover_nbits > 0) {
> +            if (s->cache_sframe_size > 0) {
> +                int cnt = get_bits_count(gb);
> +                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
> +                flush_put_bits(&s->pb);
> +                s->cache_sframe_size += s->spillover_nbits;
> +                if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
> +                    *data_size > 0) {
> +                    cnt += s->spillover_nbits;
> +                    s->skip_bits_next = cnt & 7;
> +                    return cnt >> 3;
> +                } else
> +                    skip_bits_long (gb, s->spillover_nbits - cnt +
> +                                    get_bits_count(gb)); ///< resync
> +            } else
> +                skip_bits_long(gb, s->spillover_nbits);  ///< resync
> +        }
> +    } else if (s->skip_bits_next)
> +        skip_bits(gb, s->skip_bits_next);
> +
> +    /**
> +     * Try parsing superframes in current packet.
> +     */
> +    s->cache_sframe_size = 0;
> +    s->skip_bits_next = 0;
> +    pos = get_bits_left(gb);
> +    if ((res = synth_superframe(ctx, data, data_size)) < 0) {
> +        return res;
> +    } else if (*data_size > 0) {
> +        int cnt = get_bits_count(gb);
> +        s->skip_bits_next = cnt & 7;
> +        return cnt >> 3;
> +    } else if ((s->cache_sframe_size = pos) > 0) {
> +        /** rewind bit reader to start of last (incomplete) superframe... */
> +        init_get_bits(gb, avpkt->data, size << 3);
> +        skip_bits_long(gb, (size << 3) - pos);
> +        assert(get_bits_left(gb) == pos);
> +
> +        /** ...and cache it for spillover in next packet */
> +        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_SIZE);
> +        copy_bits(&s->pb, avpkt->data, size, gb, s->cache_sframe_size);
> +        ///< FIXME bad - just copy bytes as whole and add use the
> +        ///< skip_bits_next field
> +    }
> +
> +    return size;
> +}
> +
> +AVCodec wmavoice_decoder = {
> +    "wmavoice",
> +    CODEC_TYPE_AUDIO,
> +    CODEC_ID_WMAVOICE,
> +    sizeof(WMAVoiceContext),
> +    wmavoice_decode_init,
> +    NULL,
> +    NULL,
> +    wmavoice_decode_packet,
> +    CODEC_CAP_SUBFRAMES,
> +    .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
> +};
> Index: ffmpeg-svn/libavcodec/wmavoice_data.h
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ ffmpeg-svn/libavcodec/wmavoice_data.h	2010-01-19 15:28:36.000000000 -0500
> @@ -0,0 +1,2970 @@
> +/*
> + * Windows Media Voice (WMAVoice) tables.
> + * Copyright (c) 2009 Ronald S. Bultje
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file libavcodec/wmavoice_data.h
> + * @brief Windows Media Voice (WMAVoice) tables
> + * @author Ronald S. Bultje <rsbultje at gmail.com>
> + */

These tables are huge.  Is there any structure in them that could be
used to reduce the size?

-- 
Måns Rullgård
mans at mansr.com