[FFmpeg-devel] [PATCH] WMA Voice decoder
Vitor Sessak
vitor1001
Sat Jan 23 06:06:22 CET 2010
Ronald S. Bultje wrote:
> Hi,
>
> On Tue, Jan 19, 2010 at 7:38 PM, Ronald S. Bultje <rsbultje at gmail.com> wrote:
>> my first decoder, please be kind. :-).
Some comments (as kind as ffmpeg review tradition allows) ;-)
> + */
> +static const struct frame_type_desc {
> + short n_blocks, ///< amount of blocks per frame
> + ///< (each block contains 160/#n_blocks samples)
> + acb_type, ///< Adaptive codebook type in frame/block:
> + ///< - 0: fixed codebook with per-block/frame gain,
> + ///< - 1: adaptive codebook with per-frame pitch,
> + ///< - 2: adaptive codebook with per-block pitch
> + fcb_type, ///< Fixed codebook type in frame/block:
> + ///< - 0: hardcoded codebook, per-frame gain,
fcb_type == 0 is silence + comfort noise, no? Also I think those should
be an enum...
> + int frequency_domain; ///< defines which table to use during APF
> + ///< frequency domain filtering [0-7]
> + int spectrum_corr; ///< whether to do spectrum tilt correction
> + ///< in APF
> + int apf_project_mode; ///< defines the filter projection mode in
> + ///< APF [0-7]
I think it is better to leave those for the patch adding APF.
> + uint8_t vbm_tree[25]; ///< variable bit mode coding tree
> + int history_nsamples; ///< number of samples in history for signal
> + ///< prediction (through ACB)
> +
> + int lsps; ///< number of LSPs per frame [10 or 16]
> + int lsp_q_mode; ///< defines quantizer defaults [0, 1]
> + int lsp_def_mode; ///< defines different sets of LSP defaults
> + ///< [0, 1]
> + int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
> + ///< per-frame (independent coding)
> + int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
> + ///< per superframe (residual coding)
> +
> + int min_pitch_val; ///< base value for pitch parsing code
> + int max_pitch_val; ///< max value + 1 for pitch parsing
> + int pitch_nbits; ///< number of bits used to specify the
> + ///< pitch value in the frame header
> + int block_pitch_nbits; ///< number of bits used to specify the
> + ///< first block's pitch value
> + int block_pitch_range; ///< range of the block pitch
> + int block_delta_pitch_nbits; ///< number of bits used to specify the
> + ///< delta pitch between this and the last
> + ///< block's pitch value, used in all but
> + ///< first block
> + int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
> + ///< from -this to +this-1)
> + uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
> + ///< conversion
> +
> + /**
> + * @}
> + * @defgroup struct_packet Packet values
> + * Packet values, specified in the packet header or related to a packet.
> + * A packet is considered to be a single unit of data provided to this
> + * decoder by the demuxer.
> + * @{
> + */
> + int spillover_nbits; ///< number of bits of the previous packet's
> + ///< last superframe preceeding this
> + ///< packet's first full superframe (useful
> + ///< for re-synchronization also)
> + int has_residual_lsps; ///< if set, superframes contain one set of
> + ///< LSPs that cover all frames, encoded as
> + ///< independent and residual LSPs; if not
> + ///< set, each frame contains its own, fully
> + ///< independent, LSPs
> + int skip_bits_next; ///< number of bits to skip at the next call
> + ///< to #wmavoice_decode_packet() (since
> + ///< they're part of the previous superframe)
> +
> + uint8_t sframe_cache[SFRAME_CACHE_SIZE + FF_INPUT_BUFFER_PADDING_SIZE];
> + ///< cache for superframe data split over
> + ///< multiple packets
> + PutBitContext pb; ///< points into #sframe_cache
> + int cache_sframe_size; ///< set to >0 if we have data from an
> + ///< (incomplete) superframe from a previous
> + ///< packet that spilled over in the current
> + ///< packet; specifies the amount of bits in
> + ///< #sframe_cache
> +
> + /**
> + * @}
> + * @defgroup struct_frame Frame and superframe values
> + * Superframe and frame data - these can change from frame to frame,
> + * although some of them do in that case serve as a cache / history for
> + * the next frame or superframe.
> + * @{
> + */
> + double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
> + ///< superframe
> + int cur_pitch_val; ///< pitch value of the current frame
can be a local var
> + int last_pitch_val; ///< pitch value of the previous frame
> + int last_acb_type; ///< frame type [0-2] of the previous frame
> + int pitch_diff_sh16; ///< ((#cur_pitch_val - #last_pitch_val)
> + ///< << 16) / #MAX_FRAME_SIZE
> + float silence_gain; ///< set for use in blocks if acb_type == 0
Do not need to be on the context, but I don't know if making this local
wouldn't make the code uglier.
> + int aw_idx_is_ext; ///< whether the AW index was encoded in
> + ///< 8 bits (instead of 6)
> + int aw_pitch_range;
> + short aw_n_pulses[2]; ///< number of AW-pulses in each block
> + short aw_first_pulse_off[2]; ///< index of first sample to which to
> + ///< apply AW-pulses, or -0xff if unset
> + int aw_next_pulse_off_cache;
> +
> + int frame_cntr; ///< current frame index [0 - 0xFFFF]
Please add to the doxy that this is only used for prng
> + float gain_pred_err[6]; ///< cache for future gain prediction
> + float excitation_history[MAX_SIGNAL_HISTORY];
> + ///< cache of the signal of previous
> + ///< superframes, used as a history for
> + ///< future signal generation
> + float synth_history[MAX_LSPS]; ///< see #excitation_history
> + /**
> + * @}
> + */
> +} WMAVoiceContext;
> +
> +/**
> + * Sets up the variable bit mode (VBM) tree from container extradata.
> + * @param s WMA Voice decoding context.
> + * The bit context (s->gb) should be loaded with byte 23-46 of the
> + * container extradata (i.e. the ones containing the VBM tree).
> + * @return 0 on success, <0 on error.
> + */
> +static av_cold int decode_vbmtree(WMAVoiceContext *s)
> +{
> + GetBitContext *gb = &s->gb;
> + unsigned int cntr[8], n, res;
> +
> + memset(s->vbm_tree, -1, sizeof(s->vbm_tree));
> + memset(cntr, 0, sizeof(cntr));
> + for (n = 0; n < 17; n++) {
> + res = get_bits(gb, 3);
> + if (cntr[res] >= 3 + (res == 7))
> + return -1;
> + s->vbm_tree[res * 3 + cntr[res]++] = n;
> + }
> +
> + return 0;
> +}
I'd prefer if you pass a GetBitContext and s->vbm_tree as parameters.
Also, it looks reasonable to me to make the GetBitContext of
decoder_init() a local var, to avoid having s->gb point to
semantically different things (extradata on init and frame_data in
decoding).
> +/**
> + * Initialize decoder.
> + */
> +static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
> +{
> + int n, flags, pitch_range;
> + WMAVoiceContext *s = ctx->priv_data;
> +
> + /**
> + * Extradata layout:
> + * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
> + * - byte 19-22: flags field (annoyingly in LE; see below for known
> + * values),
Hmm, isn't the endianness of the flags a convention you can choose?
> + * - byte 23-46: variable bitmode tree (really just 25 * 3 bits,
> + * rest is 0).
> + */
> + if (ctx->extradata_size != 0x2E) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Invalid extradata size 0x%x != 0x2E\n", ctx->extradata_size);
> + return -1;
> + }
> + flags = AV_RL32(ctx->extradata + 18);
> + s->block_align = ctx->block_align;
> + s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
> + s->do_apf = flags & 0x1;
> + s->frequency_domain = (flags >> 2) & 0xF;
> + s->spectrum_corr = flags & 0x40;
> + s->apf_project_mode = (flags >> 7) & 0xF;
> + s->lsp_q_mode = flags & 0x2000;
> + s->lsp_def_mode = flags & 0x4000;
> + if (flags & 0x1000) {
> + s->lsps = 16;
> + s->frame_lsp_bitsize = 34;
> + s->sframe_lsp_bitsize = 60;
> + } else {
> + s->lsps = 10;
> + s->frame_lsp_bitsize = 24;
> + s->sframe_lsp_bitsize = 48;
> + }
> + for (n = 0; n < s->lsps; n++)
> + s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
> +
> + init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
> + if (decode_vbmtree(s) < 0) {
> + av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree\n");
> + return -1;
> + }
I think a "Invalid metadata" or "Invalid VBM tree, broken metadata?"
could be more handy for someone debugging a demuxer.
> + s->min_pitch_val = ((int) ((ctx->sample_rate << 8) * 0.0025) + 50) >> 8;
> + s->max_pitch_val = ((int) ((ctx->sample_rate << 8) * 0.0185) + 50) >> 8;
I think this is better done with integer math
> + pitch_range = s->max_pitch_val - s->min_pitch_val;
> + s->pitch_nbits = av_ceil_log2(pitch_range);
> + s->last_pitch_val = 40;
> + s->last_acb_type = 0;
> + s->history_nsamples = s->max_pitch_val + 8;
> + if (s->history_nsamples > MAX_SIGNAL_HISTORY) {
> + av_log(ctx, AV_LOG_ERROR, "Signal history too big: %d (max=%d), probably broken file\n",
> + s->history_nsamples, MAX_SIGNAL_HISTORY);
"Unsupported sample rate: %d"?
> + return -1;
> + }
> +
> + s->block_conv_table[0] = s->min_pitch_val;
> + s->block_conv_table[1] = (pitch_range * 25) >> 6;
> + s->block_conv_table[2] = (pitch_range * 44) >> 6;
> + s->block_conv_table[3] = s->max_pitch_val - 1;
> + s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
> + s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
> + s->block_pitch_range = s->block_conv_table[2] +
> + s->block_conv_table[3] + 1 +
> + 2 * (s->block_conv_table[1] -
> + (2 * s->min_pitch_val));
> + s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
> +
> + ctx->sample_fmt = SAMPLE_FMT_FLT;
> +
> + return 0;
> +}
> +
> +/**
> + * Read an integer coded as a variable-bit number.
> + * @param gb bit I/O context
> + */
> +static int get_vbm_bits(GetBitContext *gb)
> +{
> + int n, res;
> +
> + for (n = 0; ; n++) {
> + res = get_bits(gb, 2);
> + if (res < 3 || n == 6 /** don't increase n to 7 */)
> + break;
> + }
> +
> + return 3 * n + res;
> +}
I've never actually used the {init,get}_vlc() functions, so I'm not
sure, but it looks like they could be useful here.
> +/**
> + * Dequantize LSPs
> + * @param lsps pointer to an array of LSPs, holding at least @num values
> + * @param num number of LSPs to be dequantized
> + * @param values quantized values, contains @n_stages values
> + * @param sizes range (well, max. value) of each quantized value in @values
> + * @param n_stages number of dequantization runs
> + * @param table dequantization table to be used
> + * @param mul_q LSF multiplier
> + * @param base_q base (lowest) LSF values
> + */
> +static void dequant_lsps(double *lsps, int num,
> + const uint16_t *values, const uint16_t *sizes,
> + int n_stages, const uint8_t *table,
> + const double *mul_q, const double *base_q)
> +{
> + int n, m;
> +
> + for (n = 0; n < num; n++) lsps[n] = 0.0;
memset?
> + for (n = 0; n < n_stages; table += sizes[n++] * num)
> + for (m = 0; m < num; m++)
> + lsps[m] += base_q[n] + mul_q[n] * table[m + values[n] * num];
> +}
> +
> +/**
> + * @defgroup lsp_dequant LSP dequantization routines
> + * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
> + * @note we assume enough bits are available, caller should check.
> + * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
> + * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
> + * @{
> + */
> +/**
> + * Parse 10 independently-coded LSPs.
> + */
> +static void dequant_lsp10i(GetBitContext *gb, double *lsps)
> +{
> + static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
> + static const double mul_lsf[4] = {
> + 5.2187144800e-3, 1.4626986422e-3,
> + 9.6179549166e-4, 1.1325736225e-3
> + };
> + static const double base_lsf[4] = {
> + M_PI * -2.15522e-1, M_PI * -6.1646e-2,
> + M_PI * -3.3486e-2, M_PI * -5.7408e-2
> + };
> + uint16_t v[4];
> +
> + v[0] = get_bits(gb, 8);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 5);
> + v[3] = get_bits(gb, 5);
> +
> + dequant_lsps(lsps, 10, v, vec_sizes, 4, ff_wmavoice_dq_lsp10i,
> + mul_lsf, base_lsf);
> +}
> +
> +/**
> + * Parse 10 independently-coded LSPs, and then derive the tables to
> + * generate LSPs for the other frames from them (residual coding).
> + */
> +static void dequant_lsp10r(GetBitContext *gb,
> + double *i_lsps, const double *old,
> + double *a1, double *a2, int q_mode)
> +{
> + static const uint16_t vec_sizes[3] = { 0x80, 0x40, 0x40 };
I prefer decimal
> + static const double mul_lsf[3] = {
> + 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
> + };
> + static const double base_lsf[3] = {
> + M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
> + };
> + const float (*ipol_tab)[2][10] = q_mode ?
> + ff_wmavoice_lsp10_intercoeff_b : ff_wmavoice_lsp10_intercoeff_a;
> + uint16_t interpol, v[3];
> + int n;
> +
> + dequant_lsp10i(gb, i_lsps);
> +
> + interpol = get_bits(gb, 5);
> + v[0] = get_bits(gb, 7);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 6);
> +
> + for (n = 0; n < 10; n++) {
> + a1[n] = ipol_tab[interpol][0][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + a1[10 + n] = ipol_tab[interpol][1][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + }
> +
> + dequant_lsps(a2, 20, v, vec_sizes, 3, ff_wmavoice_dq_lsp10r,
> + mul_lsf, base_lsf);
> +}
> +
> +/**
> + * Parse 16 independently-coded LSPs.
> + */
> +static void dequant_lsp16i(GetBitContext *gb, double *lsps)
> +{
> + static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
> + static const double mul_lsf[5] = {
> + 3.3439586280e-3, 6.9908173703e-4,
> + 3.3216608306e-3, 1.0334960326e-3,
> + 3.1899104283e-3
> + };
> + static const double base_lsf[5] = {
> + M_PI * -1.27576e-1, M_PI * -2.4292e-2,
> + M_PI * -1.28094e-1, M_PI * -3.2128e-2,
> + M_PI * -1.29816e-1
> + };
> + uint16_t v[5];
> +
> + v[0] = get_bits(gb, 8);
> + v[1] = get_bits(gb, 6);
> + v[2] = get_bits(gb, 7);
> + v[3] = get_bits(gb, 6);
> + v[4] = get_bits(gb, 7);
> +
> + dequant_lsps( lsps, 5, v, vec_sizes, 2,
> + ff_wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
> + dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
> + ff_wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
> + dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
> + ff_wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
> +}
> +
> +/**
> + * Parse 16 independently-coded LSPs, and then derive the tables to
> + * generate LSPs for the other frames from them (residual coding).
> + */
> +static void dequant_lsp16r(GetBitContext *gb,
> + double *i_lsps, const double *old,
> + double *a1, double *a2, int q_mode)
> +{
> + static const uint16_t vec_sizes[3] = { 0x80, 0x80, 0x80 };
> + static const double mul_lsf[3] = {
> + 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
> + };
> + static const double base_lsf[3] = {
> + M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
> + };
> + const float (*ipol_tab)[2][16] = q_mode ?
> + ff_wmavoice_lsp16_intercoeff_b : ff_wmavoice_lsp16_intercoeff_a;
> + uint16_t interpol, v[3];
> + int n;
> +
> + dequant_lsp16i(gb, i_lsps);
> +
> + interpol = get_bits(gb, 5);
> + v[0] = get_bits(gb, 7);
> + v[1] = get_bits(gb, 7);
> + v[2] = get_bits(gb, 7);
> +
> + for (n = 0; n < 16; n++) {
> + a1[n] = ipol_tab[interpol][0][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + a1[16 + n] = ipol_tab[interpol][1][n] * (old[n] - i_lsps[n]) +
> + i_lsps[n];
> + }
> +
> + dequant_lsps( a2, 10, v, vec_sizes, 1,
> + ff_wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
> + dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
> + ff_wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
> + dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
> + ff_wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
> +}
Semi-duplicated code, but hard to see how to factor it...
> +/**
> + * @}
> + * @defgroup aw Pitch-adaptive window coding functions
> + * The next few functions are for pitch-adaptive window coding.
> + * @{
> + */
> +#define NO_OFFSET -0xff
> +/**
> + * Parse the offset of the first pitch-adaptive window pulses, and
> + * the distribution of pulses between the two blocks in this frame.
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param pitch pitch for each block in this frame
> + */
> +
> +static void aw_parse_coords(AVCodecContext *ctx, GetBitContext *gb,
> + const short *pitch)
> +{
> + static const uint8_t start_offset[94] = {
> + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
> + 24, 26, 29, 28, 30, 31, 32, 33, 34, 35, 36, 37,
> + 38, 39, 40, 41, 42, 43, 44, 46, 48, 50, 52, 54,
> + 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78,
> + 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102,
> + 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
> + 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
> + 152, 154, 156, 158, 160, 162, 164, 166, 168, 170
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + int bits, n, offset;
> + short off_table[11], first_idx[2];
> +
> + s->aw_idx_is_ext = 0;
> + if ((bits = get_bits(gb, 6)) >= 54) {
> + s->aw_idx_is_ext = 1;
> + bits += (bits - 54) * 3 + get_bits(gb, 2);
> + }
> + s->aw_pitch_range = 16 + 8 * (FFMIN(pitch[0], pitch[1]) > 32);
> +
> + for (offset = start_offset[bits] - 11, n = 0;
> + offset < MAX_FRAMESIZE + s->aw_pitch_range / 2 && n < 11;
> + offset += pitch[offset >= MAX_FRAMESIZE / 2], n++)
> + off_table[n] = offset;
Please do not write all this in a single statement.
> + while (n < 11) off_table[n++] = NO_OFFSET;
Line break
> + s->aw_n_pulses[0] = s->aw_n_pulses[1] = 0;
> + s->aw_first_pulse_off[0] = s->aw_first_pulse_off[1] = NO_OFFSET;
> + first_idx[0] = first_idx[1] = 0;
> + for (n = 0; n < 11; n++) {
> + if (off_table[n] >= MAX_FRAMESIZE / 2) {
> + if (off_table[n] < MAX_FRAMESIZE) { ///< block[1]
> + if (s->aw_n_pulses[1]++ == 0) {
> + s->aw_first_pulse_off[1] = off_table[n] -
> + (MAX_FRAMESIZE + s->aw_pitch_range) / 2;
> + first_idx[1] = n;
> + }
> + }
Does off_table[n] >= MAX_FRAMESIZE have the same behavior as
off_table[n] == NO_OFFSET? If yes, it can be simplified by making
off_table[n] >= MAX_FRAMESIZE impossible.
> + } else if (off_table[n] >= 0) { ///< block[0]
> + if (s->aw_n_pulses[0]++ == 0) {
> + s->aw_first_pulse_off[0] =
> + off_table[n] - s->aw_pitch_range / 2;
> + first_idx[0] = n;
> + }
> + }
Also
int idx = off_table[n] >= MAX_FRAMESIZE / 2;
if (s->aw_n_pulses[idx]++ == 0) {
s->aw_first_pulse_off[idx] = off_table[n] -
(MAX_FRAMESIZE + s->aw_pitch_range) / 2;
first_idx[idx] = n;
}
> + }
> + if (first_idx[0] > 0)
> + while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pitch_range > 0)
> + s->aw_first_pulse_off[0] -= pitch[0];
> + if (first_idx[1] > 0)
> + while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pitch_range > 0)
> + s->aw_first_pulse_off[1] -= pitch[1];
> +}
> +
> +/**
> + * Apply second set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param pitch pitch for this block
> + * @param out target vector to apply pulses to
> + * @param size size of @out vector
> + */
> +static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
> + int block_idx, int pitch, float *out, int size)
> +{
> + short arr2[32];
> + unsigned int arr1[3];
> + int pulse_off = s->aw_first_pulse_off[block_idx],
> + pulse_start, n, m, idx, range;
> + float v;
> +
> + assert(size == MAX_FRAMESIZE / 2);
> + if (pulse_off != NO_OFFSET)
> + while (pulse_off + s->aw_pitch_range < 1) pulse_off += pitch;
> +
> + if (s->aw_n_pulses[0]) {
> + if (block_idx == 0) {
> + range = 32;
> + } else { ///< block_idx == 1
> + range = 8;
> + if (pulse_off != NO_OFFSET) pulse_off = s->aw_next_pulse_off_cache;
> + }
> + } else
> + range = 16;
> + pulse_start = pulse_off != NO_OFFSET ? pulse_off - range / 2 : 0;
> +
> + arr1[0] = arr1[1] = arr1[2] = -1;
> +#define BIT_IS_SET(idx) arr1[idx >> 5] & (1 << (idx & 31))
> +#define UNSET_BIT(idx) arr1[idx >> 5] &= ~(1 << (idx & 31))
> + memset(arr2, 0, sizeof(arr2));
> +
> + if (pulse_off != NO_OFFSET) for (n = 0; n < 11; n++) {
line break
> + m = pulse_off + n * pitch;
> + for (idx = m; idx < m + s->aw_pitch_range; idx++)
> + if (idx >= 0 && idx < size) UNSET_BIT(idx);
> + }
Is this bit array there just to optimize a range-checking?
> + for (n = 0, m = 0; m < 500 && n < range; pulse_start++, m++) {
> + for (idx = pulse_start; idx < 0; idx += pitch);
> + if (idx >= size) {
> + for (idx = 0; idx < size; idx++)
> + if (BIT_IS_SET(idx)) break;
> + if (idx >= size) continue;
> + }
> + if (BIT_IS_SET(idx)) {
> + arr2[n++] = idx;
> + UNSET_BIT(idx);
> + }
> + }
Isn't this whole loop a NOP when pulse_off == NO_OFFSET? I'd say this
calculation needs more cleanup...
> +#undef BIT_IS_SET
> +#undef UNSET_BIT
> +
> + idx = get_bits(gb, s->aw_n_pulses[0] ? 5 - 2 * block_idx : 4);
> + v = get_bits1(gb) ? -1.0 : 1.0;
> + for (n = arr2[idx]; n < size; n += pitch)
> + out[n] += v;
AMRFixed.n = 1;
AMRFixed.x[0] = n;
AMRFixed.y[0] = v;
AMRFixed.pitch_lag = pitch;
AMRFixed.pitch_fac = 1.0;
ff_set_fixed_vector()
> +
> + s->aw_next_pulse_off_cache = n - size; ///< relative to start of block
> +}
> +
> +/**
> + * Apply first set of pitch-adaptive window pulses.
> + * @param s WMA Voice decoding context private data
> + * @param gb bit I/O context
> + * @param block_idx block index in frame [0, 1]
> + * @param pitch pitch for this block
> + * @param out target vector to apply pulses to
> + * @param size size of @out vector
> + */
> +static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
> + int block_idx, int pitch, float *out, int size)
> +{
> + int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
> + float v;
> +
> + assert(size == MAX_FRAMESIZE / 2);
> + if (s->aw_n_pulses[block_idx]) {
> + int n, m;
> +
> + if (s->aw_pitch_range == 24) { ///< 3 pulses, 1:sign + 3:index each
> + for (n = 0; n < 3; n++, val >>= 4) {
> + v = val & 8 ? -1.0 : 1.0;
> + for (m = (val & 7) * 3 - n + 2 + s->aw_first_pulse_off[block_idx];
> + m < size; m += pitch)
> + if (m >= 0) out[m] = v;
.x[n] = m;
.y[n] = v;
> + }
> + } else { ///< 4 pulses, 1:sign + 2:index each
> + for (n = 0; n < 4; n++, val >>= 3) {
> + v = val & 4 ? -1.0 : 1.0;
> + for (m = (val & 3) * 4 - n + 3 + s->aw_first_pulse_off[block_idx];
> + m < size; m += pitch)
> + if (m >= 0) out[m] = v;
> + }
> +
> + }
> + } else {
> + int num2 = (val & 0x1FF) >> 1, delta, idx;
> +
> + if (num2 < 79) delta = 1;
> + else if (num2 < 156) delta = 2;
> + else if (num2 < 231) delta = 3;
> + else delta = 4;
> + idx = (delta * delta + num2) % 80;
> + delta += delta - 1;
> +
> + v = (val & 0x200) ? -1.0 : 1.0;
> + out[idx - delta] = v;
> + out[idx] = (val & 1) ? -v : v;
> + }
ff_set_fixed_vector()
> +}
> +#undef NO_OFFSET
> +
> +/**
> + * @}
> + *
> + * Generate a random number that is different for each frame/block
> + * and can be used as an index for a table with 1000 entries, if
> + * we want to read @block_size entries following.
Doxy formatting
> + *
> + * @param frame_cntr current frame number
> + * @param block_num current block index
> + * @param block_size amount of entries we want to read from a table
> + * that has 1000 entries
> + * @returns a unique random number for each @block_cntr/@block_num
It cannot be unique. Imagine if you have more than 1000 frames.
I'd say something like
"Returns a random number in [0, 1000-block_size] calculated from
frame_cntr and block_size". Or even better, you can pass max == 1000 -
block_size as a parameter and generate a prn between [0, max].
> + * combination, which can be used as index in a table that
> + * has a 1000 entries from which we want to read @block_size
> + * entries.
> + */
> +
> +static int pRNG(int frame_cntr, int block_num, int block_size)
> +{
> + int x = (block_num * 1877 + frame_cntr) % 0xFFFF;
> + int y = (x % 9) * 5 + 6;
> + int z = (uint16_t) (x * 49995 / y);
> + return z % (1000 - block_size);
> +}
> +
> +/**
> + * Parse hardcoded signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_hardcoded(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + const struct frame_type_desc *frame_desc,
> + float *excitation)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + float gain;
> + int n, r_idx;
> +
> + assert(size <= MAX_FRAMESIZE);
> +
> + /**
> + * For acb_type==0, this is where we set up the index to start reading
> + * from @std_codebook from.
> + */
> + switch (frame_desc->fcb_type) {
> + case 0: ///< silence
> + r_idx = pRNG(s->frame_cntr, block_idx, size);
> + gain = s->silence_gain;
> + break;
> +
> + case 1: ///< explicit (hardcoded) codebook signal
> + r_idx = get_bits(gb, 8);
> + gain = ff_wmavoice_gain_universal[get_bits(gb, 6)];
> + break;
> + }
> +
> + /**
> + * Clear gain prediction parameters.
> + */
> + memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
> +
> + /**
> + * Apply gain to hardcoded codebook and use that as excitation signal.
> + */
> + for (n = 0; n < size; n++)
> + excitation[n] = ff_wmavoice_std_codebook[r_idx + n] * gain;
> +}
> +
> +/**
> + * Parse FCB/ACB signal for a single block.
> + * @note see synth_block().
> + */
> +
> +static void synth_block_fcb_acb(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + int block_pitch_sh2,
> + const struct frame_type_desc *frame_desc,
> + float *excitation)
> +{
> + static const float gain_coeff[6] = {
> + 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
> + };
> + WMAVoiceContext *s = ctx->priv_data;
> + float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
> + int n, idx, sh, block_pitch;
> +
> + assert(size <= MAX_FRAMESIZE / 2);
> + memset(pulses, 0, sizeof(float) * size);
> +
> + /**
> + * For the other frame types, this is where we apply the innovation
> + * (fixed) codebook pulses of the speech signal.
> + */
> + switch (frame_desc->fcb_type) {
> + case 2: ///< pitch-adapting window (AW) fixed codebook
> + aw_pulse_set1(s, gb, block_idx, block_pitch_sh2 >> 2,
> + pulses, size);
> +
> + aw_pulse_set2(s, gb, block_idx, block_pitch_sh2 >> 2,
> + pulses, size);
> + break;
> +
> + case 4: case 5: case 6: ///< innovation (fixed) codebook
> + sh = 8 - frame_desc->fcb_type; ///< 4:4/5:3/6:2
> + for (n = 0; n < 5; n++) {
> + float pulse = get_bits1(gb) ? 1.0 : -1.0;
> + int idx1, idx2;
> +
> + idx1 = get_bits(gb, sh);
> + pulses[n + 5 * idx1] += pulse;
> + if (n < frame_desc->dbl_pulses) {
> + idx2 = get_bits(gb, sh);
> + pulses[n + 5 * idx2] += (idx1 >= idx2) ? pulse : -pulse;
Another AMRFixed usage case. Maybe it would be better to pass it to
aw_pulse_setx() and do the ff_set_fixed_vector() after the switch.
> + }
> + }
> + break;
> + }
> +
> + /**
> + * Calculate gain for adaptive & fixed codebook signal.
> + * @note see ff_amr_set_fixed_gain().
> + */
> + idx = get_bits(gb, 7);
> + fcb_gain = ff_wmavoice_gain_codebook_fcb[idx] *
> + expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
> + 5.2409161640);
> + acb_gain = ff_wmavoice_gain_codebook_acb[idx];
> + pred_err = logf(av_clipf(ff_wmavoice_gain_codebook_fcb[idx], 0.05, 5.0));
> +
> + if (frame_desc->n_blocks > 1)
> + memmove(&s->gain_pred_err[8 / frame_desc->n_blocks],
> + s->gain_pred_err,
> + sizeof(float) * (6 - 8 / frame_desc->n_blocks));
> + for (n = 0; n < 8 / frame_desc->n_blocks; n++)
> + s->gain_pred_err[n] = pred_err;
> +
> + /**
> + * Calculation of adaptive codebook.
> + */
> + switch (frame_desc->acb_type) {
> + case 1: ///< adaptive codebook
> + for (n = 0; n < size; n++) {
> + float v;
> + int pitch_sh8 = (s->last_pitch_val << 8) +
> + ((s->pitch_diff_sh16 * (block_idx * size + n)) >> 8);
> + int pitch = (pitch_sh8 + 0x80) >> 8,
> + idx = (((pitch << 8) - pitch_sh8) * 8 + 0x480) >> 8, m;
> + pitch -= idx >> 3;
> + idx &= 7;
> + for (v = 0., m = 16; m >= 0; m--)
> + v += ff_wmavoice_ipol1_coeffs[idx][m] *
> + excitation[n + m - pitch - 8];
> + excitation[n] = v;
> + }
> + break;
> +
> + case 2: ///< adaptive codebook
> + block_pitch = block_pitch_sh2 >> 2, idx = block_pitch_sh2 & 3;
> + if (idx--) {
> + for (n = 0; n < size; n++) {
> + float v;
> + int m;
> +
> + for (v = 0., m = 0; m < 16; m++)
> + v += ff_wmavoice_ipol2_coeffs[idx][m] *
> + excitation[m + n - 8 - block_pitch];
> + excitation[n] = v;
> + }
> + } else
> + for (n = 0; n < size; n++)
> + excitation[n] = excitation[n - block_pitch];
> + break;
> + }
> + /**
> + * Interpolate ACB/FCB and use as excitation signal.
> + */
> + ff_weighted_vector_sumf(excitation, excitation, pulses,
> + acb_gain, fcb_gain, size);
> +}
> +
> +/**
> + * Parse data in a single block.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoding context
> + * @param gb bit I/O context
> + * @param block_idx index of the to-be-read block
> + * @param size amount of samples to be read in this block
> + * @param block_pitch_sh2 pitch for this block << 2
> + * @param lsps LSPs for (the end of) this frame
> + * @param pre_lsps LSPs for the last frame
> + * @param frame_desc frame type descriptor
> + * @param excitation target memory for the ACB+FCB interpolated signal
> + * @param synth target memory for the speech synthesis filter output
> + * @return 0 on success, <0 on error.
> + */
> +
> +static void synth_block(AVCodecContext *ctx, GetBitContext *gb,
> + int block_idx, int size,
> + int block_pitch_sh2,
> + const double *lsps, const double *prev_lsps,
> + const struct frame_type_desc *frame_desc,
> + float *excitation, float *synth)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + double i_lsps[MAX_LSPS];
> + float lpcs[MAX_LSPS];
> + float fac;
> + int n;
> +
> + if (frame_desc->acb_type == 0)
> + synth_block_hardcoded(ctx, gb, block_idx, size, frame_desc, excitation);
> + else
> + synth_block_fcb_acb(ctx, gb, block_idx, size, block_pitch_sh2,
> + frame_desc, excitation);
> +
> + /** convert interpolated LSPs to LPCs */
> + fac = (block_idx + 0.5) / frame_desc->n_blocks;
> + for (n = 0; n < s->lsps; n++) ///< LSF -> LSP
> + i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
> + ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
> +
> + /**
> + * Speech synthesis.
> + */
> + ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
> +}
> +
> +/**
> + * Synthesize output samples for a single frame.
> + * @note we assume enough bits are available, caller should check.
> + *
> + * @param ctx WMA Voice decoder context
> + * @param gb bit I/O context (s->gb or one for cross-packet superframes)
> + * @param data pointer to output sample buffer, has space for at least 160
> + * samples
> + * @param lsps LSP array
> + * @param prev_lsps array of previous frame's LSPs
> + * @return 0 on success, <0 on error.
> + */
> +static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
> + float *samples,
> + const double *lsps, const double *prev_lsps,
> + float *excitation, float *synth)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + int n, n_blocks_x2;
> + short pitch[MAX_BLOCKS], last_block_pitch;
> +
> + /**
> + * Parse frame type ("frame header"), see #frame_descs.
> + */
> + int bd_idx = s->vbm_tree[get_vbm_bits(gb)],
> + block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
Is frame size ever different from MAX_FRAMESIZE?
> + /**
> + * Pitch (per frame type):
> + * - type 0: unused
> + * - type 1: provided (globally) for the whole frame. In #synth_block(),
> + * we derive the "pitch-per-sample" for adaptive codebook
> + * reading.
> + * - type 2: provided per block (see just before the call to
> + * #parse_block()), so not read here.
> + */
> + switch (frame_descs[bd_idx].acb_type) {
> + case 0:
> + memset(pitch, 0, sizeof(pitch[0]) * frame_descs[bd_idx].n_blocks);
> + break;
> + case 1:
> + n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
> + s->cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
> + if (s->last_acb_type == 0 ||
> + 20 * abs(s->cur_pitch_val - s->last_pitch_val) >
> + (s->cur_pitch_val + s->last_pitch_val))
> + s->last_pitch_val = s->cur_pitch_val;
> +
> + /** pitch per frame/block */
> + for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> + int fac = ((n << 1) | 1);
> +
> + pitch[n] = (fac * s->cur_pitch_val +
> + (n_blocks_x2 - fac) * s->last_pitch_val +
> + frame_descs[bd_idx].n_blocks) / n_blocks_x2;
> + }
> +
> + /** pitch per sample */
> + s->pitch_diff_sh16 =
> + ((s->cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
> + break;
> + }
> +
> + /**
> + * Global gain (if silence) and pitch-adaptive window coordinates.
> + */
> + switch (frame_descs[bd_idx].fcb_type) {
> + case 0:
> + s->silence_gain = ff_wmavoice_gain_silence[get_bits(gb, 8)];
> + break;
> + case 2:
> + aw_parse_coords(ctx, gb, pitch);
> + break;
> + }
> +
> + for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
> + int bl_pitch_sh2 = pitch[n] << 2;
> +
> + /**
> + * If pitch is given per block, parse that first. Per-block pitches
> + * are encoded as an absolute value for the first block, and then
> + * delta values for all subsequent blocks. Unit is different (also
> + * scale is different), so we do some stupidly complex conversion.
> + */
> + if (frame_descs[bd_idx].acb_type == 2) {
> + int block_pitch,
> + t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
> + t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
> + t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
> +
> + if (n == 0) {
> + block_pitch = get_bits(gb, s->block_pitch_nbits);
> + } else
> + block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
> + get_bits(gb, s->block_delta_pitch_nbits);
> + /**
> + * Convert last_ so that any next delta leads to a value within
> + * _range.
> + */
> + last_block_pitch = av_clip(block_pitch,
> + s->block_delta_pitch_hrange,
> + s->block_pitch_range -
> + s->block_delta_pitch_hrange);
> +
> + /**
> + * Convert semi-log-style scale back to normal scale.
> + */
> + if (block_pitch < t1) {
> + bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
> + } else {
> + block_pitch -= t1;
> + if (block_pitch < t2) {
> + bl_pitch_sh2 =
> + (s->block_conv_table[1] << 2) + (block_pitch << 1);
> + } else {
> + block_pitch -= t2;
> + if (block_pitch < t3) {
> + bl_pitch_sh2 =
> + (s->block_conv_table[2] + block_pitch) << 2;
> + } else
> + bl_pitch_sh2 = s->block_conv_table[3] << 2;
> + }
> + }
> + pitch[n] = bl_pitch_sh2 >> 2;
> + }
> +
> + synth_block(ctx, gb, n, block_nsamples, bl_pitch_sh2,
> + lsps, prev_lsps, &frame_descs[bd_idx],
> + &excitation[n * block_nsamples],
> + &synth[n * block_nsamples]);
> + }
> +
> + /**
> + * Averaging projection filter, if applicable. Else, just copy samples
> + * from synthesis buffer.
> + */
> + if (s->do_apf) {
> + // FIXME this is where APF would take place, currently not implemented
> + av_log_missing_feature(ctx, "APF", 0);
> + s->do_apf = 0;
> + } //else
> + for (n = 0; n < 160; n++)
> + samples[n] = av_clipf(synth[n], -1.0, 1.0);
> +
> + /**
> + * Cache values for next frame.
> + */
> + s->frame_cntr = (s->frame_cntr + 1) % 0xFFFF;
> + s->last_acb_type = frame_descs[bd_idx].acb_type;
> + switch (frame_descs[bd_idx].acb_type) {
> + case 0: s->last_pitch_val = 0; break;
> + case 1: s->last_pitch_val = s->cur_pitch_val; break;
> + case 2: s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1]; break;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * Ensure minimum value for first item, maximum value for last value,
> + * proper spacing between each value and proper ordering.
> + *
> + * @param lsps array of LSPs
> + * @param num size of @lsps array
> + *
> + * @note basically a double version of ff_acelp_reorder_lsf(), might be
> + * useful to put in a generic location later on. Parts are also
> + * present in ff_set_min_dist_lsf() + ff_sort_nearly_sorted_floats(),
> + * which is in float.
> + */
> +static void stabilize_lsps(double *lsps, int num)
> +{
> + int n, m, l;
> +
> + /** set minimum value for first, maximum value for last and minimum
> + * spacing between LSF values.
> + * @note very similar to ff_set_min_dist_lsf(), but in double. */
> + lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
> + for (n = 1; n < num; n++)
> + lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
> + lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
> +
> + /** reorder (looks like one-time / non-recursed bubblesort)
> + * @note very similar to ff_sort_nearly_sorted_floats(), but in double */
> + for (n = 1; n < num; n++) {
> + if (lsps[n] < lsps[n - 1]) {
> + for (m = 1; m < num; m++) {
> + double tmp = lsps[m];
> + for (l = m - 1; l >= 0; l--) {
> + if (lsps[l] <= tmp) break;
> + lsps[l + 1] = lsps[l];
> + }
> + lsps[l + 1] = tmp;
> + }
> + break;
> + }
> + }
> +}
> +
> +/**
> + * Test if there's enough bits to read 1 superframe.
> + *
> + * @param orig_gb bit I/O context used for reading. This function
> + * does not modify the state of the bitreader; it
> + * only uses it to copy the current stream position
> + * @param s WMA Voice decoding context private data
> + * @returns -1 if unsupported, 1 if there are not enough bits, or 0 if OK.
> + */
> +static int check_bits_for_superframe(GetBitContext *orig_gb,
> + WMAVoiceContext *s)
> +{
> + GetBitContext s_gb, *gb = &s_gb;
> + int n, need_bits, bd_idx;
> + const struct frame_type_desc *frame_desc;
> +
> + /* initialize a copy */
> + init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
> + skip_bits_long(gb, get_bits_count(orig_gb));
> + assert(get_bits_left(gb) == get_bits_left(orig_gb));
> +
> + /* superframe header */
> + if (get_bits_left(gb) < 14)
> + return 1;
> + if (!get_bits1(gb))
> + return -1; ///< WMAPro-in-WMAVoice superframe
> + if (get_bits1(gb)) skip_bits(gb, 12); ///< number of samples in superframe
> + if (s->has_residual_lsps) { ///< residual LSPs (for all frames)
> + if (get_bits_left(gb) < s->sframe_lsp_bitsize)
> + return 1;
> + skip_bits_long(gb, s->sframe_lsp_bitsize);
> + }
> +
> + /* frames */
> + for (n = 0; n < MAX_FRAMES; n++) {
> + int aw_idx_is_ext = 0;
> +
> + if (!s->has_residual_lsps) { ///< independent LSPs (per-frame)
> + if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
> + skip_bits_long(gb, s->frame_lsp_bitsize);
> + }
> + bd_idx = s->vbm_tree[get_vbm_bits(gb)]; ///< frame type ("header")
> + frame_desc = &frame_descs[bd_idx];
> + if (frame_desc->acb_type == 1) {
> + if (get_bits_left(gb) < s->pitch_nbits)
> + return 1;
> + skip_bits_long(gb, s->pitch_nbits);
> + }
> + if (frame_desc->fcb_type == 0) {
> + skip_bits(gb, 8);
> + } else if (frame_desc->fcb_type == 2) {
> + int tmp = get_bits(gb, 6);
> + if (tmp >= 0x36) {
> + skip_bits(gb, 2);
> + aw_idx_is_ext = 1;
> + }
> + }
> +
> + /* blocks */
> + if (frame_desc->acb_type == 2) {
> + need_bits = s->block_pitch_nbits +
> + (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
> + } else if (frame_desc->fcb_type == 2) {
> + need_bits = 2 * !aw_idx_is_ext;
> + } else
> + need_bits = 0;
> + need_bits += frame_desc->frame_size;
> + if (get_bits_left(gb) < need_bits)
> + return 1;
> + skip_bits_long(gb, need_bits);
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * Synthesize output samples for a single superframe. If we have any data
> + * cached in s->sframe_cache, that will be used instead of whatever is loaded
> + * in s->gb.
> + *
> + * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
> + * to give a total of 480 samples per superframe. See #synth_frame() for frame
> + * parsing. In addition to 3 frames, superframes can also contain the LSPs
> + * (if these are globally specified for all frames (residually); they can
> + * also be specified individually per-frame. See the s->has_residual_lsps
> + * option), and can specify the number of samples encoded in this superframe
> + * (if less than 480), usually used to prevent blanks at track boundaries.
> + *
> + * @param s WMA Voice decoder context
> + * @param data pointer to output buffer for voice samples
> + * @param data_size pointer containing the size of @data on input, and the
> + * amount of @data filled on output
> + * @return 0 on success, <0 on error or 1 if there was not enough data to
> + * fully parse the superframe
> + */
> +static int synth_superframe(AVCodecContext *ctx,
> + float *samples, int *data_size)
> +{
> + WMAVoiceContext *s = ctx->priv_data;
> + GetBitContext *gb = &s->gb, s_gb;
> + int n, res, n_samples = 480;
> + double lsps[MAX_FRAMES][MAX_LSPS];
> + const double *mean_lsf = s->lsps == 16 ?
> + ff_wmavoice_mean_lsf16[s->lsp_def_mode] : ff_wmavoice_mean_lsf10[s->lsp_def_mode];
> + float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
> + float synth[MAX_LSPS + MAX_SFRAMESIZE];
> +
> + memcpy(synth, s->synth_history,
> + s->lsps * sizeof(float));
> + memcpy(excitation, s->excitation_history,
> + s->history_nsamples * sizeof(float));
I slightly prefer (it saves one memcpy), instead of
memcpy(synth, s->synth_hist, mem_size);
decode(synth);
memcpy(s->synth_hist, synth + size - mem_size, mem_size);
doing
decode(s->synth);
memmove(s->synth, s->synth + size, mem_size);
> + if (s->cache_sframe_size > 0) {
> + gb = &s_gb;
> + init_get_bits(gb, s->sframe_cache, s->cache_sframe_size);
> + s->cache_sframe_size = 0;
> + }
> +
> + if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
> +
> + /**
> + * First bit is speech/music bit, it differentiates between WMAVoice
> + * speech samples (the actual codec) and WMAVoice music samples, which
> + * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
> + * the wild yet.
> + */
> + if (!get_bits1(gb)) {
> + av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
> + return -1;
> + }
> +
> + /**
> + * (optional) nr. of samples in superframe; always <= 480 and >= 0.
> + */
> + if (get_bits1(gb)) {
> + if ((n_samples = get_bits(gb, 12)) > 480) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Superframe encodes >480 samples (%d), not allowed\n",
> + n_samples);
> + return -1;
> + }
> + }
> + /**
> + * Parse LSPs, if global for the whole superframe (can also be per-frame).
> + */
> + if (s->has_residual_lsps) {
> + double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
> +
> + for (n = 0; n < s->lsps; n++)
> + prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
> +
> + if (s->lsps == 10) {
> + dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
> + } else /* s->lsps == 16 */
> + dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
> +
> + for (n = 0; n < s->lsps; n++) {
> + lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
> + lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
> + lsps[2][n] += mean_lsf[n];
> + }
> + for (n = 0; n < 3; n++)
> + stabilize_lsps(lsps[n], s->lsps);
> + }
> +
> + /**
> + * Parse frames, optionally preceded by per-frame (independent) LSPs.
> + */
> + for (n = 0; n < 3; n++) {
> + if (!s->has_residual_lsps) {
> + int m;
> +
> + if (s->lsps == 10) {
> + dequant_lsp10i(gb, lsps[n]);
> + } else /* s->lsps == 16 */
> + dequant_lsp16i(gb, lsps[n]);
> +
> + for (m = 0; m < s->lsps; m++)
> + lsps[n][m] += mean_lsf[m];
> + stabilize_lsps(lsps[n], s->lsps);
> + }
> +
> + if ((res = synth_frame(ctx, gb,
> + &samples[n * MAX_FRAMESIZE],
> + lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
> + &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
> + &synth[s->lsps + n * MAX_FRAMESIZE])))
> + return res;
> + }
> +
> + /**
> + * Statistics? FIXME - we don't check for length, a slight overrun
> + * will be caught by internal buffer padding, and anything else
> + * will be skipped, not read.
> + */
> + if (get_bits1(gb)) {
> + res = get_bits(gb, 4);
> + skip_bits(gb, 10 * (res + 1));
> + }
> +
> + /**
> + * Specify nr. of output samples.
> + */
> + *data_size = n_samples * sizeof(float);
> +
> + /**
> + * Update history.
> + */
> + memcpy(s->prev_lsps, lsps[2],
> + s->lsps * sizeof(double));
> + memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
> + s->lsps * sizeof(float));
> + memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
> + s->history_nsamples * sizeof(float));
> +
> + return 0;
> +}
> +
> +/**
> + * Parse the packet header at the start of each packet (input data to this
> + * decoder).
> + *
> + * @param s WMA Voice decoding context private data
> + * @returns 1 if not enough bits were available, or 0 on success.
> + */
> +static int parse_packet_header(WMAVoiceContext *s)
> +{
> + GetBitContext *gb = &s->gb;
> + unsigned int res;
> +
> + if (get_bits_left(gb) < 11)
> + return 1;
Can this ever happen?
-Vitor
More information about the ffmpeg-devel
mailing list