[FFmpeg-devel] [PATCH] MLP/TrueHD Decoder - 2nd try

Tue Jul 1 19:24:55 CEST 2008

Michael Niedermayer wrote:
> On Tue, Jul 01, 2008 at 01:55:39AM +0100, Ramiro Polla wrote:
>> Hello Michael,
> [...]
>>>>     //@{
>>>>     /** Restart header data */
>>>>     //! The sync word used at the start of the last restart header
>>>>     uint16_t    restart_sync_word[MAX_SUBSTREAMS];
>>>>
>>>>     //! The index of the first channel coded in this substream
>>>>     uint8_t     min_channel[MAX_SUBSTREAMS];
>>>>     //! The index of the last channel coded in this substream
>>>>     uint8_t     max_channel[MAX_SUBSTREAMS];
>>>>     //! The number of channels input into the rematrix stage
>>>>     uint8_t     max_matrix_channel[MAX_SUBSTREAMS];
>>>>
>>>>     //! The left shift applied to random noise in 0x31ea substreams
>>>>     uint8_t     noise_shift[MAX_SUBSTREAMS];
>>>>     //! The current seed value for the pseudorandom noise generator(s)
>>>>     uint32_t    noisegen_seed[MAX_SUBSTREAMS];
>>>>
>>>>     //! Does this substream contain extra info to check the size of VLC 
>>>> blocks?
>>>>     uint8_t     data_check_present[MAX_SUBSTREAMS];
>>>>
>>>>     //! Bitmask of which parameter sets are conveyed in a decoding 
>>>> parameter block
>>>>     uint8_t     param_presence_flags[MAX_SUBSTREAMS];
>>>> #define PARAM_BLOCKSIZE     (1 << 7)
>>>> #define PARAM_MATRIX        (1 << 6)
>>>> #define PARAM_OUTSHIFT      (1 << 5)
>>>> #define PARAM_QUANTSTEP     (1 << 4)
>>>> #define PARAM_FIR           (1 << 3)
>>>> #define PARAM_IIR           (1 << 2)
>>>> #define PARAM_HUFFOFFSET    (1 << 1)
>>>>     //@}
>>>>
>>>>     //@{
>>>>     /** Matrix data */
>>>>
>>>>     //! Number of matrices to be applied
>>>>     uint8_t     num_primitive_matrices[MAX_SUBSTREAMS];
>>>>
>>>>     //! Output channel of matrix
>>>>     uint8_t     matrix_ch[MAX_SUBSTREAMS][MAX_MATRICES];
>>>>
>>>>     //! Whether the LSBs of the matrix output are encoded in the 
>>>> bitstream
>>>>     uint8_t     lsb_bypass[MAX_SUBSTREAMS][MAX_MATRICES];
>>>>     //! Matrix coefficients, stored as 2.14 fixed point
>>>>     int32_t     
>>>> matrix_coeff[MAX_SUBSTREAMS][MAX_MATRICES][MAX_CHANNELS+2];
>>>>     //! Left shift to apply to noise values in 0x31eb substreams
>>>>     uint8_t     matrix_noise_shift[MAX_SUBSTREAMS][MAX_MATRICES];
>>>>     //@}
>>>>
>>>>     //! Left shift to apply to huffman-decoded residuals
>>>>     uint8_t     quant_step_size[MAX_SUBSTREAMS][MAX_CHANNELS];
>>>>
>>>>     //! Number of PCM samples in current audio block
>>>>     uint16_t    blocksize[MAX_SUBSTREAMS];
>>>>     //! Number of PCM samples decoded so far in this frame
>>>>     uint16_t    blockpos[MAX_SUBSTREAMS];
>>>>
>>>>     //! Left shift to apply to decoded PCM values to get final 24-bit 
>>>> output
>>>>     int8_t      output_shift[MAX_SUBSTREAMS][MAX_CHANNELS];
>>>>
>>>>     //@{
>>>>     /** Filter data. */
>>>> #define FIR 0
>>>> #define IIR 1
>>>>     //! Number of taps in filter
>>>>     uint8_t     filter_order[MAX_CHANNELS][2];
>>>>     //! Right shift to apply to output of filter
>>>>     uint8_t     filter_coeff_q[MAX_CHANNELS][2];
>>>>
>>>>     uint8_t     filter_index[MAX_CHANNELS][2];
>>>>
>>>>     int32_t     filter_coeff[MAX_CHANNELS][2][MAX_FILTER_ORDER];
>>>>     int32_t     filter_state[MAX_CHANNELS][2][MAX_FILTER_ORDER];
>>>>     //@}
>>>>
>>>>     //@{
>>>>     /** Sample data coding infomation */
>>>>     //! Offset to apply to residual values
>>>>     int16_t     huff_offset[MAX_CHANNELS];
>>>>     //! Sign/rounding corrected version of huff_offset
>>>>     int32_t     sign_huff_offset[MAX_CHANNELS];
>>>>     //! Which VLC codebook to use to read residuals
>>>>     uint8_t     codebook[MAX_CHANNELS];
>>>>     //! Size of residual suffix not encoded using VLC
>>>>     uint8_t     huff_lsbs[MAX_CHANNELS];
>>>>     //@}
>>>>
>>>>     //! Running XOR of all output samples
>>>>     int32_t     lossless_check_data[MAX_SUBSTREAMS];
>>>>
>>>>     int8_t      noise_buffer[MAX_BLOCKSIZE_POW2];
>>>>     int8_t      bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
>>>>     int32_t     sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS+2];
>>>> } MLPDecodeContext;
>>> There are so many blah[substr] all over the place, maybe putting these
>>> in a SubStream struct would simplify the code?
>> What kind of simplifications did you have in mind? The result of a half-job 
>> looked uglier to me. Attached substr_struct.diff patch is what I started 
>> doing.
> 
> Well, my idea was more like:
> 
> -m->min_channel       [substr] = get_bits(gbp, 4);
> -m->max_channel       [substr] = get_bits(gbp, 4);
> -m->max_matrix_channel[substr] = get_bits(gbp, 4);
> +s->min_channel       = get_bits(gbp, 4);
> +s->max_channel       = get_bits(gbp, 4);
> +s->max_matrix_channel= get_bits(gbp, 4);
> 
> Of course if this doesnt work out and ends up messy, slow or otherwise
> has issues then it shouldnt be done ...

Much cleaner and a bit faster too.

> [...]
>>> [...]
>>>> /** Generate a PCM sample using the prediction filters and a residual 
>>>> value
>>>>  *  read from the data stream, and update the filter state.
>>>>  */
>>>>
>>>> static int filter_sample(MLPDecodeContext *m, unsigned int substr,
>>>>                          unsigned int channel, int32_t residual)
>>>> {
>>>>     unsigned int i, j, index;
>>>>     int64_t accum = 0;
>>>>     int32_t result;
>>>>
>>>>     /* TODO: Move this code to DSPContext? */
>>>>
>>>> #define INDEX(channel, order, pos) \
>>>>     ((m->filter_index[channel][order] + (pos)) & (MAX_FILTER_ORDER - 1))
>>>>
>>>>     for (j = 0; j < 2; j++)
>>>>         for (i = 0; i < m->filter_order[channel][j]; i++)
>>>>             accum += 
>>>> (int64_t)m->filter_state[channel][j][INDEX(channel,j,i)] *
>>>>                      m->filter_coeff[channel][j][i];
>>>>
>>>>     accum = accum >> m->filter_coeff_q[channel][FIR];
>>>>     result = (accum + residual)
>>>>                 & ~((1 << m->quant_step_size[substr][channel]) - 1);
>>>>
>>>>     index = INDEX(channel, FIR, -1);
>>>>     m->filter_state[channel][FIR][index] = result;
>>>>     m->filter_index[channel][FIR] = index;
>>>>
>>>>     index = INDEX(channel, IIR, -1);
>>>>     m->filter_state[channel][IIR][index] = result - accum;
>>>>     m->filter_index[channel][IIR] = index;
>>>>
>>>> #undef INDEX
>>>>
>>>>     return result;
>>>> }
>>> Why the complicated indexing?
>>> how big would filter_state be if it where a normal flat array? 160 
>>> entries?
>> It would need
>> sizeof(int32_t) * MAX_BLOCKSIZE * 2 * MAX_CHANNELS = 20480 = 20k
>> bytes more in the context, and attached filter_index_flat_array.diff patch. 
> 
>> I assume that's not ok =)
> 
> yes, but do we really need that amount?
> if the huff decode and filter loops are split (i assume thats possible) then
> we can do a
> for(channels)
>     for(sample in block)
> which would only need sizeof(int32_t) * 2 * MAX_BLOCKSIZE
> and a memcpy() or 2 for a reading writing the state

Done this way and got some speedup. But for now I leave 
filter_state_buffer in the context. Should I move it to the stack as in 
attached filter_state_buffer_stack.diff patch?

> [...]
>>>>     for (i = 0; i < m->blocksize[substr]; i++) {
>>>>         for (mat = 0; mat < m->num_primitive_matrices[substr]; mat++)
>>>>             if (m->lsb_bypass[substr][mat])
>>>>                 m->bypassed_lsbs[i + m->blockpos[substr]][mat] = 
>>>> get_bits1(gbp);
>>>>
>>>>         for (ch = m->min_channel[substr]; ch <= m->max_channel[substr]; 
>>>> ch++) {
>>>>             int32_t sample;
>>>>
>>>>             if (read_huff(m, gbp, substr, ch, &sample) < 0)
>>>>                 return -1;
>>> please use the return value instead of passing pointers to it
>> sample may be positive or negative, so a simple < check won't help. But 
>> sample must not be bigger than 24 bits.
>>
>> Is attached read_huff_return_bad_code.diff patch ok?
> 
> id use INT32_MAX besides this yes its ok

Applied, but ended up not being needed anymore. I moved the loops that 
called read_huff and filter_sample in the functions. This also sped some 
things up.

>>> [...]
>>>>         for (i = 0; i < m->blockpos[substr]; i++) {
>>>>             int64_t accum = 0;
>>>>             for (src_ch = 0; src_ch <= maxchan; src_ch++) {
>>>>                 accum += (int64_t)m->sample_buffer[i][src_ch]
>>>>                                   * m->matrix_coeff[substr][mat][src_ch];
>>>>             }
>>>>             if (m->matrix_noise_shift[substr][mat]) {
>>>>                 uint32_t index = m->num_primitive_matrices[substr] - mat;
>>>>                 index = (i * (index * 2 + 1) + index) & 
>>>> (m->access_unit_size_pow2 - 1);
>>>>                 accum += m->noise_buffer[index] << 
>>>> (m->matrix_noise_shift[substr][mat] + 7);
>>>>             }
>>>>             m->sample_buffer[i][dest_ch] = ((accum >> 14) & ~((1 << 
>>>> m->quant_step_size[substr][dest_ch]) - 1))
>>>>                                              + m->bypassed_lsbs[i][mat];
>>>>         }
>>> following may be faster ...
>>> index = m->num_primitive_matrices[substr] - mat;
>>> index2= 2*index+1;
>>> for (i = 0; i < m->blockpos[substr]; i++) {
>>> ...
>>>     if (m->matrix_noise_shift[substr][mat]) {
>>>         index &= m->access_unit_size_pow2 - 1;
>>>         accum += m->noise_buffer[index] << 
>>> (m->matrix_noise_shift[substr][mat] + 7);
>>>         index += index2;
>>>     }
>> I don't have any samples that touch this code. I'm still waiting for Ian to 
>> send me more samples (and I'm searching for some more also).
>>
>> Do you prefer if I go ahead and change it anyways?
> 
> hmm do what you prefer

I'll wait for new samples then.

Ramiro Polla
-------------- next part --------------
A non-text attachment was scrubbed...
Name: filter_state_buffer_stack.diff
Type: text/x-diff
Size: 2083 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080701/b5bd8485/attachment.diff>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mlpdec.c
Type: text/x-csrc
Size: 39751 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080701/b5bd8485/attachment.c>