[FFmpeg-devel] [PATCH] WMA Voice decoder
Vitor Sessak
vitor1001
Sat Jan 23 06:06:22 CET 2010
Ronald S. Bultje wrote:
> Hi,
>
> On Tue, Jan 19, 2010 at 7:38 PM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
>> my first decoder, please be kind. :-).
Some comments (as kind as ffmpeg review tradition allows) ;-)
> + */
> +static const struct frame_type_desc {
> + short n_blocks, ///< amount of blocks per frame
> + ///< (each block contains 160/#n_blocks samples)
> + acb_type, ///< Adaptive codebook type in frame/block:
> + ///< - 0: fixed codebook with per-block/frame gain,
> + ///< - 1: adaptive codebook with per-frame pitch,
> + ///< - 2: adaptive codebook with per-block pitch
> + fcb_type, ///< Fixed codebook type in frame/block:
> + ///< - 0: hardcoded codebook, per-frame gain,
fcb_type == 0 is silence + comfort noise, no? Also I think those should
be an enum...
> + int frequency_domain; ///< defines which table to use during APF
> + ///< frequency domain filtering [0-7]
> + int spectrum_corr; ///< whether to do spectrum tilt correction
> + ///< in APF
> + int apf_project_mode; ///< defines the filter projection mode in
> + ///< APF [0-7]
I think it is better to leave those for the patch adding APF.
> + uint8_t vbm_tree[25]; ///< variable bit mode coding tree
> + int history_nsamples; ///< number of samples in history for signal
> + ///< prediction (through ACB)
> +
> + int lsps; ///< number of LSPs per frame [10 or 16]
> + int lsp_q_mode; ///< defines quantizer defaults [0, 1]
> + int lsp_def_mode; ///< defines different sets of LSP defaults
> + ///< [0, 1]
> + int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
> + ///< per-frame (independent coding)
> + int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
> + ///< per superframe (residual coding)
> +
> + int min_pitch_val; ///< base value for pitch parsing code
> + int max_pitch_val; ///< max value + 1 for pitch parsing
> + int pitch_nbits; ///< number of bits used to specify the
> + ///< pitch value in the frame header
> + int block_pitch_nbits; ///< number of bits used to specify the
> + ///< first block's pitch value
> + int block_pitch_range; ///< range of the block pitch
> + int block_delta_pitch_nbits; ///< number of bits used to specify the
> + ///< delta pitch between this and the last
> + ///< block's pitch value, used in all but
> + ///< first block
> + int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
> + ///< from -this to +this-1)
> + uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
> + ///< conversion
> +
> + /**
> + * @}
> + * @defgroup struct_packet Packet values
> + * Packet values, specified in the packet header or related to a packet.
> + * A packet is considered to be a single unit of data provided to this
> + * decoder by the demuxer.
> + * @{
> + */
> + int spillover_nbits; ///< number of bits of the previous packet's
> + ///< last superframe preceeding this
> + ///< packet's first full superframe (useful
> + ///< for re-synchronization also)
> + int has_residual_lsps; ///< if set, superframes contain one set of
> + ///< LSPs that cover all frames, encoded as
> + ///< independent and residual LSPs; if not
> + ///< set, each frame contains its own, fully
> + ///< independent, LSPs
> + int skip_bits_next; ///< number of bits to skip at the next call
> + ///< to #wmavoice_decode_packet() (since
> + ///< they're part of the previous superframe)
> +
> + uint8_t sframe_cache[SFRAME_CACHE_SIZE + FF_INPUT_BUFFER_PADDING_SIZE];
> + ///< cache for superframe data split over
> + ///< multiple packets
> + PutBitContext pb; ///< points into #sframe_cache
> + int cache_sframe_size; ///< set to >0 if we have data from an
> + ///< (incomplete) superframe from a previous
> + ///< packet that spilled over in the current
> + ///< packet; specifies the amount of bits in
> + ///< #sframe_cache
> +
> + /**
> + * @}
> + * @defgroup struct_frame Frame and superframe values
> + * Superframe and frame data - these can change from frame to frame,
> + * although some of them do in that case serve as a cache / history for
> + * the next frame or superframe.
> + * @{
> + */
> + double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
> + ///< superframe
> + int cur_pitch_val; ///< pitch value of the current frame
can be a local var
> + int last_pitch_val; ///< pitch value of the previous frame
> + int last_acb_type; ///< frame type [0-2] of the previous frame
> + int pitch_diff_sh16; ///< ((#cur_pitch_val - #last_pitch_val)
> + ///< << 16) / #MAX_FRAME_SIZE
> + float silence_gain; ///< set for use in blocks if acb_type == 0
Do not need to be on the context, but I don't know if making this local
wouldn't make the code uglier.
> + int aw_idx_is_ext; ///< whether the AW index was encoded in
> + ///< 8 bits (instead of 6)
> + int aw_pitch_range;
> + short aw_n_pulses[2]; ///< number of AW-pulses in each block
> + short aw_first_pulse_off[2]; ///< index of first sample to which to
> + ///< apply AW-pulses, or -0xff if unset
> + int aw_next_pulse_off_cache;
> +
> + int frame_cntr; ///< current frame index [0 - 0xFFFF]
Please add to the doxy that this is only used for prng
> + float gain_pred_err[6]; ///< cache for future gain prediction
> + float excitation_history[MAX_SIGNAL_HISTORY];
> + ///< cache of the signal of previous
> + ///< superframes, used as a history for
> + ///< future signal generation
> + float synth_history[MAX_LSPS]; ///< see #excitation_history
> + /**
> + * @}
> + */
> +} WMAVoiceContext;
> +
> +/**
> + * Sets up the variable bit mode (VBM) tree from container extradata.
> + * @param s WMA Voice decoding context.
> + * The bit context (s->gb) should be loaded with byte 23-46 of the
> + * container extradata (i.e. the ones containing the VBM tree).
> + * @return 0 on success, <0 on error.
> + */
> +static av_cold int decode_vbmtree(WMAVoiceContext *s)
> +{
> + GetBitContext *gb = &s->gb;
> + unsigned int cntr[8], n, res;
> +
> + memset(s->vbm_tree, -1, sizeof(s->vbm_tree));
> + memset(cntr, 0, sizeof(cntr));
> + for (n = 0; n < 17; n++) {
> + res = get_bits(gb, 3);
> + if (cntr[res] >= 3 + (res == 7))
> + return -1;
> + s->vbm_tree[res * 3 + cntr[res]++] = n;
> + }
> +
> + return 0;
> +}
I'd prefer if you pass a GetBitContext and s->vbm_tree as parameters.
Also, it looks reasonable to me to make the GetBitContext of
decoder_init() a local var, to avoid having s->gb point to
semantically different things (extradata on init and frame_data in
decoding).
> +/**
> + * Initialize decoder.
> + */
> +static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
> +{
> + int n, flags, pitch_range;
> + WMAVoiceContext *s = ctx->priv_data;
> +
> + /**
> + * Extradata layout:
> + * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
> + * - byte 19-22: flags field (annoyingly in LE; see below for known
> + * values),
Hmm, isn't the endianness of the flags a convention you can choose?
> + * - byte 23-46: variable bitmode tree (really just 25 * 3 bits,
> + * rest is 0).
> + */
> + if (ctx->extradata_size != 0x2E) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Invalid extradata size 0x%x != 0x2E\n", ctx->extradata_size);
> + return -1;
> + }
> + flags = AV_RL32(ctx->extradata + 18);
> + s->block_align = ctx->block_align;
> + s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
> + s->do_apf = flags & 0x1;
> + s->frequency_domain = (flags >> 2) & 0xF;
> + s->spectrum_corr = flags & 0x40;
> + s->apf_project_mode = (flags >> 7) & 0xF;
> + s->lsp_q_mode = flags & 0x2000;
> + s->lsp_def_mode = flags & 0x4000;
> + if (flags & 0x1000) {
> + s->lsps = 16;
> + s->frame_lsp_bitsize = 34;
> + s->sframe_lsp_bitsize = 60;
> + } else {
> + s->lsps = 10;
> + s->frame_lsp_bitsize = 24;
> + s->sframe_lsp_bitsize = 48;
> + }
> + for (n = 0; n < s->lsps; n++)
> + s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
> +
> + init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
> + if (decode_vbmtree(s) < 0) {
> + av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree\n");
> + return -1;
> + }
I think a "Invalid metadata" or "Invalid VBM tree, broken metadata?"
could be more handy for someone debugging a demuxer.
> + s->min_pitch_val = ((int) ((ctx->sample_rate << 8) * 0.0025) + 50) >> 8;
> + s->max_pitch_val = ((int) ((ctx->sample_rate << 8) * 0.0185) + 50) >> 8;
I think this is better done with integer math
> + pitch_range = s->max_pitch_val - s->min_pitch_val;
> + s->pitch_nbits = av_ceil_log2(pitch_range);
> + s->last_pitch_val = 40;
> + s->last_acb_type = 0;
> + s->history_nsamples = s->max_pitch_val + 8;
> + if (s->history_nsamples > MAX_SIGNAL_HISTORY) {
> + av_log(ctx, AV_LOG_ERROR, "Signal history too big: %d (max=%d), probably broken file\n",
> + s->history_nsamples, MAX_SIGNAL_HISTORY);
"Unsupported sample rate: %d"?
> + return -1;
> + }
> +
> + s->block_conv_table[0] = s->min_pitch_val;
> + s->block_conv_table[1] = (pitch_range * 25) >> 6;
> + s->block_conv_table[2] = (pitch_range * 44) >> 6;
> + s->block_conv_table[3] = s->max_pitch_val - 1;
> + s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
> + s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
> + s->block_pitch_range = s->block_conv_table[2] +
> + s->block_conv_table[3] + 1 +
> + 2 * (s->block_conv_table[1] -
> + (2 * s->min_pitch_val));
> + s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
> +
> + ctx->sample_fmt = SAMPLE_FMT_FLT;
> +
> + return 0;
> +}
> +
> +/**
> + * Read an integer coded as a variable-bit number.
> + * @param gb bit I/O context
> + */
> +static int get_vbm_bits(GetBitContext *gb)
> +{
> + int n, res;
> +
> + for (n = 0; ; n++) {
> + res = get_bits(gb, 2);
> + if (res < 3 || n == 6 /** don't increase n to 7 */)
> + break;
> + }
> +
> + return 3 * n + res;
> +}
I've never actually used the {init,get}_vlc() functions, so I'm not
sure, but it looks like they could be useful here.
> +/**
> + * Dequantize LSPs
> + * @param lsps pointer to an array of LSPs, holding at least @num values
> + * @param num number of LSPs to be dequantized
> + * @param values quantized values, contains @n_stages values
> + * @param sizes range (well, max. value) of each quantized value in @values
> + * @param n_stages number of dequantization runs
> + * @param table dequantization table to be used
> + * @param mul_q LSF multiplier
> + * @param base_q base (lowest) LSF values
> + */
> +static void dequant_lsps(double *lsps, int num,
> + const uint16_t *values, const uint16_t *sizes,
> + int n_stages, const uint8_t *table,
> + const double *mul_q, const double *base_q)
> +{
> + int n, m;
> +
> + for (n = 0; n < num; n++) lsps[n] = 0.0;
memset?
> + for (n = 0; n < n_stages; table += sizes[n++] * num)
> + for (m = 0; m < num; m++)
> + lsps[m] += base_q[n] + mul_q[n] * table[m + values[n] * num];
> +}
> +
> +/**
> + * @defgroup lsp_dequant LSP dequantization routines
> + * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
> + * @note we assume enough bits are available, caller should check.
> + * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
> + * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
> + * @{
> + */
> +/**
> + * Parse 10 independently-coded LSPs.
> + */
> +static void dequant_lsp10i(GetBitContext *gb, double *lsps)
> +{
> + static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
> + static const double mul_lsf[4] = {
> + 5.2187144800e-3, 1.4626986422e-3,
> + 9.6179549166e-4, 1.1325736225e-3
> + };
> + static const double base_lsf[4] = {
> + M_PI * -2.15522e-1, M_PI * -6.1646e-2,
> + M_PI * -3.3486e-2, M_PI * -5.7408e-2
> + };
> + uint16_t v[4];
> +
> + v[0] = get_bits(gb, 8);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 5);
> + v[3] = get_bits(gb, 5);
> +
> + dequant_lsps(lsps, 10, v, vec_sizes, 4, ff_wmavoice_dq_lsp10i,
> + mul_lsf, base_lsf);
> +}
> +
> +/**
> + * Parse 10 independently-coded LSPs, and then derive the tables to
> + * generate LSPs for the other frames from them (residual coding).
> + */
> +static void dequant_lsp10r(GetBitContext *gb,
> + double *i_lsps, const double *old,
> + double *a1, double *a2, int q_mode)
> +{
> + static const uint16_t vec_sizes[3] = { 0x80, 0x40, 0x40 };
I prefer decimal
> + static const double mul_lsf[3] = {
> + 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
> + };
> + static const double base_lsf[3] = {
> + M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
> + };
> + const float (*ipol_tab)[2][10] = q_mode ?
> + ff_wmavoice_lsp10_intercoeff_b : ff_wmavoice_lsp10_intercoeff_a;
> + uint16_t interpol, v[3];
> + int n;
> +
> + dequant_lsp10i(gb, i_lsps);
> +
> + interpol = get_bits(gb, 5);
> + v[0] = get_bits(gb, 7);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 6);
> +
> + for (n = 0; n < 10; n++) {
> + a1[n] = ipol_tab[interpol][0][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + a1[10 + n] = ipol_tab[interpol][1][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + }
> +
> + dequant_lsps(a2, 20, v, vec_sizes, 3, ff_wmavoice_dq_lsp10r,
> + mul_lsf, base_lsf);
> +}
> +
> +/**
> + * Parse 16 independently-coded LSPs.
> + */
> +static void dequant_lsp16i(GetBitContext *gb, double *lsps)
> +{
> + static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
> + static const double mul_lsf[5] = {
> + 3.3439586280e-3, 6.9908173703e-4,
> + 3.3216608306e-3, 1.0334960326e-3,
> + 3.1899104283e-3
> + };
> + static const double base_lsf[5] = {
> + M_PI * -1.27576e-1, M_PI * -2.4292e-2,
> + M_PI * -1.28094e-1, M_PI * -3.2128e-2,
> + M_PI * -1.29816e-1
> + };
> + uint16_t v[5];
> +
> + v[0] = get_bits(gb, 8);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 7);
> + v[3] = get_bits(gb, 6);
> + v[4] = get_bits(gb, 7);
> +
> + dequant_lsps( lsps, 5, v, vec_sizes, 2,
> + ff_wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
> + dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
> + ff_wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
> + dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
> + ff_wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
> +}
> +
> +/**
> + * Parse 16 independently-coded LSPs, and then derive the tables to
> + * generate LSPs for the other frames from them (residual coding).
> + */
> +static void dequant_lsp16r(GetBitContext *gb,
> + double *i_lsps, const double *old,
> + double *a1, double *a2, int q_mode)
> +{
> + static const uint16_t vec_sizes[3] = { 0x80, 0x80, 0x80 };
> + static const double mul_lsf[3] = {
> + 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
> + };
> + static const double base_lsf[3] = {
> + M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
> + };
> + const float (*ipol_tab)[2][16] = q_mode ?
> + ff_wmavoice_lsp16_intercoeff_b : ff_wmavoice_lsp16_intercoeff_a;
> + uint16_t interpol, v[3];
> + int n;
> +
> + dequant_lsp16i(gb, i_lsps);
> +
> + interpol = get_bits(gb, 5);
> + v[0] = get_bits(gb, 7);
> + v[1] = get_bits(gb, 7);
> + v[2] = get_bits(gb, 7);
> +
> + for (n = 0; n < 16; n++) {
> + a1[n] = ipol_tab[interpol][0][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + a1[16 + n] = ipol_tab[interpol][1][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + }
> +
> + dequant_lsps( a2, 10, v, vec_sizes, 1,
> + ff_wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
> + dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
> + ff_wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
> + dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
> + ff_wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
> +}
Semi-duplicated code, but hard to see how to factor it...
> +/**
> + * @}
> + * @defgroup aw Pitch-adaptive window coding functions
> + * The next few functions are for pitch-adaptive window coding.
> + * @{
> + */
> +#define NO_OFFSET -0xff
> +/**
> + * Parse the offset of the first pitch-adaptive window pulses, and
> + * the distribution of pulses between the two blocks in this frame.
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param pitch pitch for each block in this frame
> + */
> +
> +static void aw_parse_coords(AVCodecContext *ctx, GetBitContext *gb,
> + const short *pitch)
> +{
> + static const uint8_t start_offset[94] = {
> + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
> + 24, 26, 29, 28, 30, 31, 32, 33, 34, 35, 36, 37,
> + 38, 39, 40, 41, 42, 43, 44, 46, 48, 50, 52, 54,
> + 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78,
> + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
> + 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
> + 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
> + 152, 154, 156, 158, 160, 162, 164, 166, 168, 170
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + int bits, n, offset;
> + short off_table[11], first_idx[2];
> +
> + s->aw_idx_is_ext = 0;
> + if ((bits = get_bits(gb, 6)) >= 54) {
> + s->aw_idx_is_ext = 1;
> + bits += (bits - 54) * 3 + get_bits(gb, 2);
> + }
> + s->aw_pitch_range = 16 + 8 * (FFMIN(pitch[0], pitch[1]) > 32);
> +
> + for (offset = start_offset[bits] - 11, n = 0;
> + offset < MAX_FRAMESIZE + s->aw_pitch_range / 2 && n < 11;
> + offset += pitch[offset >= MAX_FRAMESIZE / 2], n++)
> + off_table[n] = offset;
Please do not write all this in a single statement.
> + while (n < 11) off_table[n++] = NO_OFFSET;
Line break
> + s->aw_n_pulses[0] = s->aw_n_pulses[1] = 0;
> + s->aw_first_pulse_off[0] = s->aw_first_pulse_off[1] = NO_OFFSET;
> + first_idx[0] = first_idx[1] = 0;
> + for (n = 0; n < 11; n++) {
> + if (off_table[n] >= MAX_FRAMESIZE / 2) {
> + if (off_table[n] < MAX_FRAMESIZE) { ///< block[1]
> + if (s->aw_n_pulses[1]++ == 0) {
> + s->aw_first_pulse_off[1] = off_table[n] -
> + (MAX_FRAMESIZE + s->aw_pitch_range) / 2;
> + first_idx[1] = n;
> + }
> + }
Does off_table[n] >= MAX_FRAMESIZE have the same behavior as
off_table[n] == NO_OFFSET? If yes, it can be simplified by making
off_table[n] >= MAX_FRAMESIZE impossible.
> + } else if (off_table[n] >= 0) { ///< block[0]
> + if (s->aw_n_pulses[0]++ == 0) {
> + s->aw_first_pulse_off[0] =
> + off_table[n] - s->aw_pitch_range / 2;
> + first_idx[0] = n;
> + }
> + }
Also
int idx = off_table[n] >= MAX_FRAMESIZE / 2;
if (s->aw_n_pulses[idx]++ == 0) {
s->aw_first_pulse_off[idx] = off_table[n] -
(MAX_FRAMESIZE + s->aw_pitch_range) / 2;
first_idx[idx] = n;
}
> + }
> + if (first_idx[0] > 0)
> + while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pitch_range > 0)
> + s->aw_first_pulse_off[0] -= pitch[0];
> + if (first_idx[1] > 0)
> + while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pitch_range > 0)
> + s->aw_first_pulse_off[1] -= pitch[1];
> +}
> +
> +/**
> + * Apply second set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param pitch pitch for this block
> + * @param out target vector to apply pulses to
> + * @param size size of @out vector
> + */
> +static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
> + int block_idx, int pitch, float *out, int size)
> +{
> + short arr2[32];
> + unsigned int arr1[3];
> + int pulse_off = s->aw_first_pulse_off[block_idx],
> + pulse_start, n, m, idx, range;
> + float v;
> +
> + assert(size == MAX_FRAMESIZE / 2);
> + if (pulse_off != NO_OFFSET)
> + while (pulse_off + s->aw_pitch_range < 1) pulse_off += pitch;
> +
> + if (s->aw_n_pulses[0]) {
> + if (block_idx == 0) {
> + range = 32;
> + } else { ///< block_idx == 1
> + range = 8;
> + if (pulse_off != NO_OFFSET) pulse_off = s->aw_next_pulse_off_cache;
> + }
> + } else
> + range = 16;
> + pulse_start = pulse_off != NO_OFFSET ? pulse_off - range / 2 : 0;
> +
> + arr1[0] = arr1[1] = arr1[2] = -1;
> +#define BIT_IS_SET(idx) arr1[idx >> 5] & (1 << (idx & 31))
> +#define UNSET_BIT(idx) arr1[idx >> 5] &= ~(1 << (idx & 31))
> + memset(arr2, 0, sizeof(arr2));
> +
> + if (pulse_off != NO_OFFSET) for (n = 0; n < 11; n++) {
line break
> + m = pulse_off + n * pitch;
> + for (idx = m; idx < m + s->aw_pitch_range; idx++)
> + if (idx >= 0 && idx < size) UNSET_BIT(idx);
> + }
Is this bit array there just to optimize a range-checking?
> + for (n = 0, m = 0; m < 500 && n < range; pulse_start++, m++) {
> + for (idx = pulse_start; idx < 0; idx += pitch);
> + if (idx >= size) {
> + for (idx = 0; idx < size; idx++)
> + if (BIT_IS_SET(idx)) break;
> + if (idx >= size) continue;
> + }
> + if (BIT_IS_SET(idx)) {
> + arr2[n++] = idx;
> + UNSET_BIT(idx);
> + }
> + }
Isn't this whole loop a NOP when pulse_off == NO_OFFSET? I'd say this
calculation needs more cleanup...
> +#undef BIT_IS_SET
> +#undef UNSET_BIT
> +
> + idx = get_bits(gb, s->aw_n_pulses[0] ? 5 - 2 * block_idx : 4);
> + v = get_bits1(gb) ? -1.0 : 1.0;
> + for (n = arr2[idx]; n < size; n += pitch)
> + out[n] += v;
AMRFixed.n = 1;
AMRFixed.x[0] = n;
AMRFixed.y[0] = v;
AMRFixed.pitch_lag = pitch;
AMRFixed.pitch_fac = 1.0;
ff_set_fixed_vector()
> +
> + s->aw_next_pulse_off_cache = n - size; ///< relative to start of block
> +}
> +
> +/**
> + * Apply first set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param pitch pitch for this block
> + * @param out target vector to apply pulses to
> + * @param size size of @out vector
> + */
> +static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
> + int block_idx, int pitch, float *out, int size)
> +{
> + int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
> + float v;
> +
> + assert(size == MAX_FRAMESIZE / 2);
> + if (s->aw_n_pulses[block_idx]) {
> + int n, m;
> +
> + if (s->aw_pitch_range == 24) { ///< 3 pulses, 1:sign + 3:index each
> + for (n = 0; n < 3; n++, val >>= 4) {
> + v = val & 8 ? -1.0 : 1.0;
> + for (m = (val & 7) * 3 - n + 2 + s->aw_first_pulse_off[block_idx];
> + m < size; m += pitch)
> + if (m >= 0) out[m] = v;
.x[n] = m;
.y[n] = v;
> + }
> + } else { ///< 4 pulses, 1:sign + 2:index each
> + for (n = 0; n < 4; n++, val >>= 3) {
> + v = val & 4 ? -1.0 : 1.0;
> + for (m = (val & 3) * 4 - n + 3 + s->aw_first_pulse_off[block_idx];
> + m < size; m += pitch)
> + if (m >= 0) out[m] = v;
> + }
> +
> + }
> + } else {
> + int num2 = (val & 0x1FF) >> 1, delta, idx;
> +
> + if (num2 < 79) delta = 1;
> + else if (num2 < 156) delta = 2;
> + else if (num2 < 231) delta = 3;
> + else delta = 4;
> + idx = (delta * delta + num2) % 80;
> + delta += delta - 1;
> +
> + v = (val & 0x200) ? -1.0 : 1.0;
> + out[idx - delta] = v;
> + out[idx] = (val & 1) ? -v : v;
> + }
ff_set_fixed_vector()
> +}
> +#undef NO_OFFSET
> +
> +/**
> + * @}
> + *
> + * Generate a random number that is different for each frame/block
> + * and can be used as an index for a table with 1000 entries, if
> + * we want to read @block_size entries following.
Doxy formatting
> + *
> + * @param frame_cntr current frame number
> + * @param block_num current block index
> + * @param block_size amount of entries we want to read from a table
> + * that has 1000 entries
> + * @returns a unique random number for each @block_cntr/@block_num
It cannot be unique. Imagine if you have more than 1000 frames.
I'd say something like
"Returns a random number in [0, 1000-block_size] calculated from
frame_cntr and block_size". Or even better, you can pass max == 1000 -
block_size as a parameter and generate a prn between [0, max].
> + * combination, which can be used as index in a table that
> + * has a 1000 entries from which we want to read @block_size
> + * entries.
> + */
> +
> +static int pRNG(int frame_cntr, int block_num, int block_size)
> +{
> + int x = (block_num * 1877 + frame_cntr) % 0xFFFF;
> + int y = (x % 9) * 5 + 6;
> + int z = (uint16_t) (x * 49995 / y);
> + return z % (1000 - block_size);
> +}
> +
> +/**
> + * Parse hardcoded signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_hardcoded(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + const struct frame_type_desc *frame_desc,
> + float *excitation)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + float gain;
> + int n, r_idx;
> +
> + assert(size <= MAX_FRAMESIZE);
> +
> + /**
> + * For acb_type==0, this is where we set up the index to start reading
> + * from @std_codebook from.
> + */
> + switch (frame_desc->fcb_type) {
> + case 0: ///< silence
> + r_idx = pRNG(s->frame_cntr, block_idx, size);
> + gain = s->silence_gain;
> + break;
> +
> + case 1: ///< explicit (hardcoded) codebook signal
> + r_idx = get_bits(gb, 8);
> + gain = ff_wmavoice_gain_universal[get_bits(gb, 6)];
> + break;
> + }
> +
> + /**
> + * Clear gain prediction parameters.
> + */
> + memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
> +
> + /**
> + * Apply gain to hardcoded codebook and use that as excitation signal.
> + */
> + for (n = 0; n < size; n++)
> + excitation[n] = ff_wmavoice_std_codebook[r_idx + n] * gain;
> +}
> +
> +/**
> + * Parse FCB/ACB signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_fcb_acb(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + int block_pitch_sh2,
> + const struct frame_type_desc *frame_desc,
> + float *excitation)
> +{
> + static const float gain_coeff[6] = {
> + 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
> + int n, idx, sh, block_pitch;
> +
> + assert(size <= MAX_FRAMESIZE / 2);
> + memset(pulses, 0, sizeof(float) * size);
> +
> + /**
> + * For the other frame types, this is where we apply the innovation
> + * (fixed) codebook pulses of the speech signal.
> + */
> + switch (frame_desc->fcb_type) {
> + case 2: ///< pitch-adapting window (AW) fixed codebook
> + aw_pulse_set1(s, gb, block_idx, block_pitch_sh2 >> 2,
> + pulses, size);
> +
> + aw_pulse_set2(s, gb, block_idx, block_pitch_sh2 >> 2,
> + pulses, size);
> + break;
> +
> + case 4: case 5: case 6: ///< innovation (fixed) codebook
> + sh = 8 - frame_desc->fcb_type; ///< 4:4/5:3/6:2
> + for (n = 0; n < 5; n++) {
> + float pulse = get_bits1(gb) ? 1.0 : -1.0;
> + int idx1, idx2;
> +
> + idx1 = get_bits(gb, sh);
> + pulses[n + 5 * idx1] += pulse;
> + if (n < frame_desc->dbl_pulses) {
> + idx2 = get_bits(gb, sh);
> + pulses[n + 5 * idx2] += (idx1 >= idx2) ? pulse : -pulse;
Another AMRFixed usage case. Maybe it would be better to pass it to
aw_pulse_setx() and do the ff_set_fixed_vector() after the switch.
> + }
> + }
> + break;
> + }
> +
> + /**
> + * Calculate gain for adaptive & fixed codebook signal.
> + * @note see ff_amr_set_fixed_gain().
> + */
> + idx = get_bits(gb, 7);
> + fcb_gain = ff_wmavoice_gain_codebook_fcb[idx] *
> + expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
> + 5.2409161640);
> + acb_gain = ff_wmavoice_gain_codebook_acb[idx];
> + pred_err = logf(av_clipf(ff_wmavoice_gain_codebook_fcb[idx], 0.05, 5.0));
> +
> + if (frame_desc->n_blocks > 1)
> + memmove(&s->gain_pred_err[8 / frame_desc->n_blocks],
> + s->gain_pred_err,
> + sizeof(float) * (6 - 8 / frame_desc->n_blocks));
> + for (n = 0; n < 8 / frame_desc->n_blocks; n++)
> + s->gain_pred_err[n] = pred_err;
> +
> + /**
> + * Calculation of adaptive codebook.
> + */
> + switch (frame_desc->acb_type) {
> + case 1: ///< adaptive codebook
> + for (n = 0; n < size; n++) {
> + float v;
> + int pitch_sh8 = (s->last_pitch_val << 8) +
> + ((s->pitch_diff_sh16 * (block_idx * size + n)) >> 8);
> + int pitch = (pitch_sh8 + 0x80) >> 8,
> + idx = (((pitch << 8) - pitch_sh8) * 8 + 0x480) >> 8, m;
> + pitch -= idx >> 3;
> + idx &= 7;
> + for (v = 0., m = 16; m >= 0; m--)
> + v += ff_wmavoice_ipol1_coeffs[idx][m] *
> + excitation[n + m - pitch - 8];
> + excitation[n] = v;
> + }
> + break;
> +
> + case 2: ///< adaptive codebook
> + block_pitch = block_pitch_sh2 >> 2, idx = block_pitch_sh2 & 3;
> + if (idx--) {
> + for (n = 0; n < size; n++) {
> + float v;
> + int m;
> +
> + for (v = 0., m = 0; m < 16; m++)
> + v += ff_wmavoice_ipol2_coeffs[idx][m] *
> + excitation[m + n - 8 - block_pitch];
> + excitation[n] = v;
> + }
> + } else
> + for (n = 0; n < size; n++)
> + excitation[n] = excitation[n - block_pitch];
> + break;
> + }
> + /**
> + * Interpolate ACB/FCB and use as excitation signal.
> + */
> + ff_weighted_vector_sumf(excitation, excitation, pulses,
> + acb_gain, fcb_gain, size);
> +}
> +
> +/**
> + * Parse data in a single block.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param block_idx index of the to-be-read block
> + * @param size amount of samples to be read in this block
> + * @param block_pitch_sh2 pitch for this block << 2
> + * @param lsps LSPs for (the end of) this frame
> + * @param pre_lsps LSPs for the last frame
> + * @param frame_desc frame type descriptor
> + * @param excitation target memory for the ACB+FCB interpolated signal
> + * @param synth target memory for the speech synthesis filter output
> + * @return 0 on success, <0 on error.
> + */
> +
> +static void synth_block(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + int block_pitch_sh2,
> + const double *lsps, const double *prev_lsps,
> + const struct frame_type_desc *frame_desc,
> + float *excitation, float *synth)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + double i_lsps[MAX_LSPS];
> + float lpcs[MAX_LSPS];
> + float fac;
> + int n;
> +
> + if (frame_desc->acb_type == 0)
> + synth_block_hardcoded(ctx, gb, block_idx, size, frame_desc, excitation);
> + else
> + synth_block_fcb_acb(ctx, gb, block_idx, size, block_pitch_sh2,
> + frame_desc, excitation);
> +
> + /** convert interpolated LSPs to LPCs */
> + fac = (block_idx + 0.5) / frame_desc->n_blocks;
> + for (n = 0; n < s->lsps; n++) ///< LSF -> LSP
> + i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
> + ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
> +
> + /**
> + * Speech synthesis.
> + */
> + ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
> +}
> +
> +/**
> + * Synthesize output samples for a single frame.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoder context
> + * @param gb bit I/O context (s->gb or one for cross-packet superframes)
> + * @param data pointer to output sample buffer, has space for at least 160
> + * samples
> + * @param lsps LSP array
> + * @param prev_lsps array of previous frame's LSPs
> + * @return 0 on success, <0 on error.
> + */
> +static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
> + float *samples,
> + const double *lsps, const double *prev_lsps,
> + float *excitation, float *synth)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + int n, n_blocks_x2;
> + short pitch[MAX_BLOCKS], last_block_pitch;
> +
> + /**
> + * Parse frame type ("frame header"), see #frame_descs.
> + */
> + int bd_idx = s->vbm_tree[get_vbm_bits(gb)],
> + block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
Is frame size ever different from MAX_FRAMESIZE?
> + /**
> + * Pitch (per frame type):
> + * - type 0: unused
> + * - type 1: provided (globally) for the whole frame. In #synth_block(),
> + * we derive the "pitch-per-sample" for adaptive codebook
> + * reading.
> + * - type 2: provided per block (see just before the call to
> + * #parse_block()), so not read here.
> + */
> + switch (frame_descs[bd_idx].acb_type) {
> + case 0:
> + memset(pitch, 0, sizeof(pitch[0]) * frame_descs[bd_idx].n_blocks);
> + break;
> + case 1:
> + n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
> + s->cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
> + if (s->last_acb_type == 0 ||
> + 20 * abs(s->cur_pitch_val - s->last_pitch_val) >
> + (s->cur_pitch_val + s->last_pitch_val))
> + s->last_pitch_val = s->cur_pitch_val;
> +
> + /** pitch per frame/block */
> + for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> + int fac = ((n << 1) | 1);
> +
> + pitch[n] = (fac * s->cur_pitch_val +
> + (n_blocks_x2 - fac) * s->last_pitch_val +
> + frame_descs[bd_idx].n_blocks) / n_blocks_x2;
> + }
> +
> + /** pitch per sample */
> + s->pitch_diff_sh16 =
> + ((s->cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
> + break;
> + }
> +
> + /**
> + * Global gain (if silence) and pitch-adaptive window coordinates.
> + */
> + switch (frame_descs[bd_idx].fcb_type) {
> + case 0:
> + s->silence_gain = ff_wmavoice_gain_silence[get_bits(gb, 8)];
> + break;
> + case 2:
> + aw_parse_coords(ctx, gb, pitch);
> + break;
> + }
> +
> + for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> + int bl_pitch_sh2 = pitch[n] << 2;
> +
> + /**
> + * If pitch is given per block, parse that first. Per-block pitches
> + * are encoded as an absolute value for the first block, and then
> + * delta values for all subsequent blocks. Unit is different (also
> + * scale is different), so we do some stupidly complex conversion.
> + */
> + if (frame_descs[bd_idx].acb_type == 2) {
> + int block_pitch,
> + t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
> + t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
> + t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
> +
> + if (n == 0) {
> + block_pitch = get_bits(gb, s->block_pitch_nbits);
> + } else
> + block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
> + get_bits(gb, s->block_delta_pitch_nbits);
> + /**
> + * Convert last_ so that any next delta leads to a value within
> + * _range.
> + */
> + last_block_pitch = av_clip(block_pitch,
> + s->block_delta_pitch_hrange,
> + s->block_pitch_range -
> + s->block_delta_pitch_hrange);
> +
> + /**
> + * Convert semi-log-style scale back to normal scale.
> + */
> + if (block_pitch < t1) {
> + bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
> + } else {
> + block_pitch -= t1;
> + if (block_pitch < t2) {
> + bl_pitch_sh2 =
> + (s->block_conv_table[1] << 2) + (block_pitch << 1);
> + } else {
> + block_pitch -= t2;
> + if (block_pitch < t3) {
> + bl_pitch_sh2 =
> + (s->block_conv_table[2] + block_pitch) << 2;
> + } else
> + bl_pitch_sh2 = s->block_conv_table[3] << 2;
> + }
> + }
> + pitch[n] = bl_pitch_sh2 >> 2;
> + }
> +
> + synth_block(ctx, gb, n, block_nsamples, bl_pitch_sh2,
> + lsps, prev_lsps, &frame_descs[bd_idx],
> + &excitation[n * block_nsamples],
> + &synth[n * block_nsamples]);
> + }
> +
> + /**
> + * Averaging projection filter, if applicable. Else, just copy samples
> + * from synthesis buffer.
> + */
> + if (s->do_apf) {
> + // FIXME this is where APF would take place, currently not implemented
> + av_log_missing_feature(ctx, "APF", 0);
> + s->do_apf = 0;
> + } //else
> + for (n = 0; n < 160; n++)
> + samples[n] = av_clipf(synth[n], -1.0, 1.0);
> +
> + /**
> + * Cache values for next frame.
> + */
> + s->frame_cntr = (s->frame_cntr + 1) % 0xFFFF;
> + s->last_acb_type = frame_descs[bd_idx].acb_type;
> + switch (frame_descs[bd_idx].acb_type) {
> + case 0: s->last_pitch_val = 0; break;
> + case 1: s->last_pitch_val = s->cur_pitch_val; break;
> + case 2: s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1]; break;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * Ensure minimum value for first item, maximum value for last value,
> + * proper spacing between each value and proper ordering.
> + *
> + * @param lsps array of LSPs
> + * @param num size of @lsps array
> + *
> + * @note basically a double version of ff_acelp_reorder_lsf(), might be
> + * useful to put in a generic location later on. Parts are also
> + * present in ff_set_min_dist_lsf() + ff_sort_nearly_sorted_floats(),
> + * which is in float.
> + */
> +static void stabilize_lsps(double *lsps, int num)
> +{
> + int n, m, l;
> +
> + /** set minimum value for first, maximum value for last and minimum
> + * spacing between LSF values.
> + * @note very similar to ff_set_min_dist_lsf(), but in double. */
> + lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
> + for (n = 1; n < num; n++)
> + lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
> + lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
> +
> + /** reorder (looks like one-time / non-recursed bubblesort)
> + * @note very similar to ff_sort_nearly_sorted_floats(), but in double */
> + for (n = 1; n < num; n++) {
> + if (lsps[n] < lsps[n - 1]) {
> + for (m = 1; m < num; m++) {
> + double tmp = lsps[m];
> + for (l = m - 1; l >= 0; l--) {
> + if (lsps[l] <= tmp) break;
> + lsps[l + 1] = lsps[l];
> + }
> + lsps[l + 1] = tmp;
> + }
> + break;
> + }
> + }
> +}
> +
> +/**
> + * Test if there's enough bits to read 1 superframe.
> + *
> + * @param orig_gb bit I/O context used for reading. This function
> + * does not modify the state of the bitreader; it
> + * only uses it to copy the current stream position
> + * @param s WMA Voice decoding context private data
> + * @returns -1 if unsupported, 1 if there are not enough bits, or 0 if OK.
> + */
> +static int check_bits_for_superframe(GetBitContext *orig_gb,
> + WMAVoiceContext *s)
> +{
> + GetBitContext s_gb, *gb = &s_gb;
> + int n, need_bits, bd_idx;
> + const struct frame_type_desc *frame_desc;
> +
> + /* initialize a copy */
> + init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
> + skip_bits_long(gb, get_bits_count(orig_gb));
> + assert(get_bits_left(gb) == get_bits_left(orig_gb));
> +
> + /* superframe header */
> + if (get_bits_left(gb) < 14)
> + return 1;
> + if (!get_bits1(gb))
> + return -1; ///< WMAPro-in-WMAVoice superframe
> + if (get_bits1(gb)) skip_bits(gb, 12); ///< number of samples in superframe
> + if (s->has_residual_lsps) { ///< residual LSPs (for all frames)
> + if (get_bits_left(gb) < s->sframe_lsp_bitsize)
> + return 1;
> + skip_bits_long(gb, s->sframe_lsp_bitsize);
> + }
> +
> + /* frames */
> + for (n = 0; n < MAX_FRAMES; n++) {
> + int aw_idx_is_ext = 0;
> +
> + if (!s->has_residual_lsps) { ///< independent LSPs (per-frame)
> + if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
> + skip_bits_long(gb, s->frame_lsp_bitsize);
> + }
> + bd_idx = s->vbm_tree[get_vbm_bits(gb)]; ///< frame type ("header")
> + frame_desc = &frame_descs[bd_idx];
> + if (frame_desc->acb_type == 1) {
> + if (get_bits_left(gb) < s->pitch_nbits)
> + return 1;
> + skip_bits_long(gb, s->pitch_nbits);
> + }
> + if (frame_desc->fcb_type == 0) {
> + skip_bits(gb, 8);
> + } else if (frame_desc->fcb_type == 2) {
> + int tmp = get_bits(gb, 6);
> + if (tmp >= 0x36) {
> + skip_bits(gb, 2);
> + aw_idx_is_ext = 1;
> + }
> + }
> +
> + /* blocks */
> + if (frame_desc->acb_type == 2) {
> + need_bits = s->block_pitch_nbits +
> + (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
> + } else if (frame_desc->fcb_type == 2) {
> + need_bits = 2 * !aw_idx_is_ext;
> + } else
> + need_bits = 0;
> + need_bits += frame_desc->frame_size;
> + if (get_bits_left(gb) < need_bits)
> + return 1;
> + skip_bits_long(gb, need_bits);
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * Synthesize output samples for a single superframe. If we have any data
> + * cached in s->sframe_cache, that will be used instead of whatever is loaded
> + * in s->gb.
> + *
> + * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
> + * to give a total of 480 samples per superframe. See #synth_frame() for frame
> + * parsing. In addition to 3 frames, superframes can also contain the LSPs
> + * (if these are globally specified for all frames (residually); they can
> + * also be specified individually per-frame. See the s->has_residual_lsps
> + * option), and can specify the number of samples encoded in this superframe
> + * (if less than 480), usually used to prevent blanks at track boundaries.
> + *
> + * @param s WMA Voice decoder context
> + * @param data pointer to output buffer for voice samples
> + * @param data_size pointer containing the size of @data on input, and the
> + * amount of @data filled on output
> + * @return 0 on success, <0 on error or 1 if there was not enough data to
> + * fully parse the superframe
> + */
> +static int synth_superframe(AVCodecContext *ctx,
> + float *samples, int *data_size)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + GetBitContext *gb = &s->gb, s_gb;
> + int n, res, n_samples = 480;
> + double lsps[MAX_FRAMES][MAX_LSPS];
> + const double *mean_lsf = s->lsps == 16 ?
> + ff_wmavoice_mean_lsf16[s->lsp_def_mode] : ff_wmavoice_mean_lsf10[s->lsp_def_mode];
> + float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
> + float synth[MAX_LSPS + MAX_SFRAMESIZE];
> +
> + memcpy(synth, s->synth_history,
> + s->lsps * sizeof(float));
> + memcpy(excitation, s->excitation_history,
> + s->history_nsamples * sizeof(float));
I slightly prefer (it saves one memcpy), instead of
memcpy(synth, s->synth_hist, mem_size);
decode(synth);
memcpy(s->synth_hist, synth + size - mem_size, mem_size);
doing
decode(s->synth);
memmove(s->synth, s->synth + size, mem_size);
> + if (s->cache_sframe_size > 0) {
> + gb = &s_gb;
> + init_get_bits(gb, s->sframe_cache, s->cache_sframe_size);
> + s->cache_sframe_size = 0;
> + }
> +
> + if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
> +
> + /**
> + * First bit is speech/music bit, it differentiates between WMAVoice
> + * speech samples (the actual codec) and WMAVoice music samples, which
> + * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
> + * the wild yet.
> + */
> + if (!get_bits1(gb)) {
> + av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
> + return -1;
> + }
> +
> + /**
> + * (optional) nr. of samples in superframe; always <= 480 and >= 0.
> + */
> + if (get_bits1(gb)) {
> + if ((n_samples = get_bits(gb, 12)) > 480) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Superframe encodes >480 samples (%d), not allowed\n",
> + n_samples);
> + return -1;
> + }
> + }
> + /**
> + * Parse LSPs, if global for the whole superframe (can also be per-frame).
> + */
> + if (s->has_residual_lsps) {
> + double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
> +
> + for (n = 0; n < s->lsps; n++)
> + prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
> +
> + if (s->lsps == 10) {
> + dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
> + } else /* s->lsps == 16 */
> + dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
> +
> + for (n = 0; n < s->lsps; n++) {
> + lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
> + lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
> + lsps[2][n] += mean_lsf[n];
> + }
> + for (n = 0; n < 3; n++)
> + stabilize_lsps(lsps[n], s->lsps);
> + }
> +
> + /**
> + * Parse frames, optionally preceded by per-frame (independent) LSPs.
> + */
> + for (n = 0; n < 3; n++) {
> + if (!s->has_residual_lsps) {
> + int m;
> +
> + if (s->lsps == 10) {
> + dequant_lsp10i(gb, lsps[n]);
> + } else /* s->lsps == 16 */
> + dequant_lsp16i(gb, lsps[n]);
> +
> + for (m = 0; m < s->lsps; m++)
> + lsps[n][m] += mean_lsf[m];
> + stabilize_lsps(lsps[n], s->lsps);
> + }
> +
> + if ((res = synth_frame(ctx, gb,
> + &samples[n * MAX_FRAMESIZE],
> + lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
> + &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
> + &synth[s->lsps + n * MAX_FRAMESIZE])))
> + return res;
> + }
> +
> + /**
> + * Statistics? FIXME - we don't check for length, a slight overrun
> + * will be caught by internal buffer padding, and anything else
> + * will be skipped, not read.
> + */
> + if (get_bits1(gb)) {
> + res = get_bits(gb, 4);
> + skip_bits(gb, 10 * (res + 1));
> + }
> +
> + /**
> + * Specify nr. of output samples.
> + */
> + *data_size = n_samples * sizeof(float);
> +
> + /**
> + * Update history.
> + */
> + memcpy(s->prev_lsps, lsps[2],
> + s->lsps * sizeof(double));
> + memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
> + s->lsps * sizeof(float));
> + memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
> + s->history_nsamples * sizeof(float));
> +
> + return 0;
> +}
> +
> +/**
> + * Parse the packet header at the start of each packet (input data to this
> + * decoder).
> + *
> + * @param s WMA Voice decoding context private data
> + * @returns 1 if not enough bits were available, or 0 on success.
> + */
> +static int parse_packet_header(WMAVoiceContext *s)
> +{
> + GetBitContext *gb = &s->gb;
> + unsigned int res;
> +
> + if (get_bits_left(gb) < 11)
> + return 1;
Can this ever happen?
-Vitor
More information about the ffmpeg-devel
mailing list