[FFmpeg-devel] [PATCH] MPEG-4 Parametric Stereo decoder

Tue Jun 8 20:13:41 CEST 2010

On Thu, Jun 3, 2010 at 12:47 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Wed, Jun 02, 2010 at 10:28:32PM -0400, Alex Converse wrote:
> > On Mon, May 31, 2010 at 6:42 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> [...]
> > >
> > > > +{
> > > > + ? ?int b;
> > > > + ? ?int table_idx = dt ? huff_icc_dt : huff_icc_df;
> > > > + ? ?VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> > > > + ? ?if (dt) {
> > > > + ? ? ? ?int e_prev = e ? e - 1 : ps->num_env_old - 1;
> > > > + ? ? ? ?e_prev = FFMAX(e_prev, 0);
> > > > + ? ? ? ?for (b = 0; b < ps->nr_icc_par; b++) {
> > > > + ? ? ? ? ? ?ps->icc_par[e][b] = ps->icc_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
> > > > + ? ? ? ? ? ?if (ps->icc_par[e][b] > 7U) {
> > > > + ? ? ? ? ? ? ? ?av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
> > > > + ? ? ? ? ? ? ? ?return -1;
> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ?}
> > > > + ? ?} else {
> > > > + ? ? ? ?int prev = 0;
> > > > + ? ? ? ?for (b = 0; b < ps->nr_icc_par; b++) {
> > > > + ? ? ? ? ? ?prev += get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
> > > > + ? ? ? ? ? ?ps->icc_par[e][b] = prev;
> > > > + ? ? ? ? ? ?if (ps->icc_par[e][b] > 7U) {
> > > > + ? ? ? ? ? ? ? ?av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
> > > > + ? ? ? ? ? ? ? ?return -1;
> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ?}
> > > > + ? ?}
> > >
> > > this could be simplified to:
> > > for (b = 0; b < ps->nr_icc_par; b++) {
> > > ? ?ps->icc_par[e][b]= prev[b] + get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
> > > ? ?if (ps->icc_par[e][b] > 7U) {
> > > ? ? ? ?av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
> > > ? ? ? ?return -1;
> > > ? ?}
> > > }
> > > and this possibly applies to other functions as well
> > >
> >
> > Are you talking about the if case the else case or both combined?
>
> iam talking about combining the if and else by having prev be a pointer
> into par
>

That would require extra padding for the df case. It would also add
more array lookups.

>
> > why is prev dereferenced two levels but not par?
> > Am I missing something here?
> >
> >
> > > > + ? ?return 0;
> > > > +}
> > > > +
> > > > +static void ipd_data(GetBitContext *gb, PSContext *ps, int e, int dt)
> > > > +{
> > > > + ? ?int b;
> > > > + ? ?int table_idx = dt ? huff_ipd_dt : huff_ipd_df;
> > > > + ? ?VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> > > > + ? ?if (dt) {
> > > > + ? ? ? ?int e_prev = e ? e - 1 : ps->num_env_old - 1;
> > > > + ? ? ? ?e_prev = FFMAX(e_prev, 0);
> > > > + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
> > > > + ? ? ? ? ? ?ps->ipd_par[e][b] = (ps->ipd_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 1)) & 0x07;
> > > > + ? ? ? ?}
> > > > + ? ?} else {
> > > > + ? ? ? ?int prev = 0;
> > > > + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
> > > > + ? ? ? ? ? ?prev += get_vlc2(gb, vlc_table, 9, 3);
> > > > + ? ? ? ? ? ?prev &= 0x07;
> > > > + ? ? ? ? ? ?ps->ipd_par[e][b] = prev;
> > > > + ? ? ? ?}
> > > > + ? ?}
> > > > +}
> > > > +
> > > > +static void opd_data(GetBitContext *gb, PSContext *ps, int e, int dt)
> > > > +{
> > > > + ? ?int b;
> > > > + ? ?int table_idx = dt ? huff_opd_dt : huff_opd_df;
> > > > + ? ?VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> > > > + ? ?if (dt) {
> > > > + ? ? ? ?int e_prev = e ? e - 1 : ps->num_env_old - 1;
> > > > + ? ? ? ?e_prev = FFMAX(e_prev, 0);
> > > > + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
> > > > + ? ? ? ? ? ?ps->opd_par[e][b] = (ps->opd_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 1)) & 0x07;
> > > > + ? ? ? ?}
> > > > + ? ?} else {
> > > > + ? ? ? ?int prev = 0;
> > > > + ? ? ? ?for (b = 0; b < ps->nr_ipdopd_par; b++) {
> > > > + ? ? ? ? ? ?prev += get_vlc2(gb, vlc_table, 9, 3);
> > > > + ? ? ? ? ? ?prev &= 0x07;
> > > > + ? ? ? ? ? ?ps->opd_par[e][b] = prev;
> > > > + ? ? ? ?}
> > > > + ? ?}
> > > > +}
> > >
> > > all these functions look like they do pretty much the same, maybe this can be
> > > done with just a single function?
> > >
> >
> > I thought about it but they all have different valid ranges, unless
> > you just mean merging ipd/opd which both have the same circular range.
>
> why not pass the range as argument to the function?
>

The three range cases seem vastly different:

FFABS(ps->iid_par[e][b]) > 7 + 8 * ps->iid_quant
ps->icc_par[e][b] > 7U
prev &= 0x07

the last one even using circular addressing.

> > > > +
> > > > +/** Split one subband into 2 subsubbands with a real filter */
> > > > +static av_noinline void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[7], int len, int reverse)
> > > > +{
> > > > + ? ?int i, j;
> > > > + ? ?for (i = 0; i < len; i++) {
> > > > + ? ? ? ?float re_in = filter[6] * in[6+i][0]; ? ? ? ?//real inphase
> > >
> > > 0.5 * in[6+i][0]
> >
> > If we are going to start inlining values here are we better off using
> > the g1_Q2 symbol directly in this function rather than making it a
> > parameter?
>
> you are the aac expert, i dont know if this function is intended to be ever
> used with a different array.
>

no, except for the fact that it's a fairly generic signal processing function.

> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ? ? ?temp[ssb][0] = sum_re;
> > > > + ? ? ? ? ? ?temp[ssb][1] = sum_im;
> > > > + ? ? ? ?}
> > > > + ? ? ? ?out[0][i][0] = temp[6][0];
> > > > + ? ? ? ?out[0][i][1] = temp[6][1];
> > > > + ? ? ? ?out[1][i][0] = temp[7][0];
> > > > + ? ? ? ?out[1][i][1] = temp[7][1];
> > > > + ? ? ? ?out[2][i][0] = temp[0][0];
> > > > + ? ? ? ?out[2][i][1] = temp[0][1];
> > > > + ? ? ? ?out[3][i][0] = temp[1][0];
> > > > + ? ? ? ?out[3][i][1] = temp[1][1];
> > > > + ? ? ? ?out[4][i][0] = temp[2][0] + temp[5][0];
> > > > + ? ? ? ?out[4][i][1] = temp[2][1] + temp[5][1];
> > > > + ? ? ? ?out[5][i][0] = temp[3][0] + temp[4][0];
> > > > + ? ? ? ?out[5][i][1] = temp[3][1] + temp[4][1];
> > > > + ? ?}
> > > > +}
> > > > +
> > > > +static av_noinline void hybrid4_8_12_cx(float (*in)[2], float (*out)[32][2], const float (*filter)[7][2], int N, int len)
> > > > +{
> > > > + ? ?int i, j, ssb;
> > > > +
> > > > + ? ?for (i = 0; i < len; i++) {
> > > > + ? ? ? ?for (ssb = 0; ssb < N; ssb++) {
> > > > + ? ? ? ? ? ?float sum_re = filter[ssb][6][0] * in[i+6][0], sum_im = filter[ssb][6][0] * in[i+6][1];
> > > > + ? ? ? ? ? ?for (j = 0; j < 6; j++) {
> > > > + ? ? ? ? ? ? ? ?float in0_re = in[i+j][0];
> > > > + ? ? ? ? ? ? ? ?float in0_im = in[i+j][1];
> > > > + ? ? ? ? ? ? ? ?float in1_re = in[i+12-j][0];
> > > > + ? ? ? ? ? ? ? ?float in1_im = in[i+12-j][1];
> > > > + ? ? ? ? ? ? ? ?sum_re += filter[ssb][j][0] * (in0_re + in1_re) - filter[ssb][j][1] * (in0_im - in1_im);
> > > > + ? ? ? ? ? ? ? ?sum_im += filter[ssb][j][0] * (in0_im + in1_im) + filter[ssb][j][1] * (in0_re - in1_re);
> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ? ? ?out[ssb][i][0] = sum_re;
> > > > + ? ? ? ? ? ?out[ssb][i][1] = sum_im;
> > > > + ? ? ? ?}
> > > > + ? ?}
> > > > +}
> > > > +
> > > > +static av_noinline void hybrid_analysis(float out[91][32][2], float in[64][44][2], int is34, int len)
> > > > +{
> > > > + ? ?int i;
> > > > + ? ?if(is34) {
> > > > + ? ? ? ?hybrid4_8_12_cx(in[0], out, ? ?f34_0_12, 12, len);
> > > > + ? ? ? ?hybrid4_8_12_cx(in[1], out+12, f34_1_8, ? 8, len);
> > > > + ? ? ? ?hybrid4_8_12_cx(in[2], out+20, f34_2_4, ? 4, len);
> > > > + ? ? ? ?hybrid4_8_12_cx(in[3], out+24, f34_2_4, ? 4, len);
> > > > + ? ? ? ?hybrid4_8_12_cx(in[4], out+28, f34_2_4, ? 4, len);
> > > > + ? ? ? ?for (i = 0; i < 59; i++) {
> > > > + ? ? ? ? ? ?memcpy(out[32 + i], in[5 + i]+6, len * sizeof(in[0][0]));
> > > > + ? ? ? ?}
> > > > + ? ?} else {
> > > > + ? ? ? ?hybrid6_cx(in[0], out, f20_0_8, len);
> > > > + ? ? ? ?hybrid2_re(in[1], out+6, g1_Q2, len, 1);
> > > > + ? ? ? ?hybrid2_re(in[2], out+8, g1_Q2, len, 0);
> > > > + ? ? ? ?for (i = 0; i < 61; i++) {
> > > > + ? ? ? ? ? ?memcpy(out[10 + i], in[3 + i]+6, len * sizeof(in[0][0]));
> > > > + ? ? ? ?}
> > >
> > > is all that memcpy unavoidable?
> > >
> >
> > It may be avoidable with a float(**)x[2] instead of a float(*)[][2].
> > FFmpeg seems to avoid the former.
>
> we prefer 2d fixed size arrays over pointer array due to extra dereferences
>

But are pointer matrices preferred when they avoid large memcpys?

> >
> > I was also thinking it may be prudent to merge transpose_in() with
> > hybrid_analysis() which would make the point moot, no?
>
> if that gets rid of the memcpy() its of course a good idea.
> ive not looked in depth at all functions yet.
>
>

It should.
SBR filterbanks transforms are finalized now, so that portability
layer can be squashed.

> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_10_to_20(opd_mapped[e], ps->opd_par[e], 0);
> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_20_to_34(opd_mapped[e], opd_mapped[e], 0);
> > > > + ? ? ? ? ? ? ? ?} else {
> > > > + ? ? ? ? ? ? ? ? ? ?ipd_mapped = ps->ipd_par;
> > > > + ? ? ? ? ? ? ? ? ? ?opd_mapped = ps->opd_par;
> > > > + ? ? ? ? ? ? ? ?}
> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ?}
> > > > + ? ? ? ?if (!ps->is34bands_old) {
> > > > + ? ? ? ? ? ?map_val_20_to_34(H11[0], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H11[1], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H12[0], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H12[1], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H21[0], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H21[1], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H22[0], 0);
> > > > + ? ? ? ? ? ?map_val_20_to_34(H22[1], 0);
> > > > + ? ? ? ? ? ?ipdopd_reset(ps->ipd_smooth, ps->opd_smooth);
> > > > + ? ? ? ?}
> > > > + ? ?} else {
> > > > + ? ? ? ?for (e = 0; e < ps->num_env; e++) {
> > > > + ? ? ? ? ? ?if (ps->nr_icc_par == 34)
> > > > + ? ? ? ? ? ? ? ?map_idx_34_to_20(icc_mapped[e], ps->icc_par[e], 1);
> > > > + ? ? ? ? ? ?else if (ps->nr_icc_par == 10)
> > > > + ? ? ? ? ? ? ? ?map_idx_10_to_20(icc_mapped[e], ps->icc_par[e], 1);
> > > > + ? ? ? ? ? ?else
> > > > + ? ? ? ? ? ? ? ?icc_mapped = ps->icc_par;
> > > > + ? ? ? ? ? ?if (ps->nr_iid_par == 34)
> > > > + ? ? ? ? ? ? ? ?map_idx_34_to_20(iid_mapped[e], ps->iid_par[e], 1);
> > > > + ? ? ? ? ? ?else if (ps->nr_iid_par == 10)
> > > > + ? ? ? ? ? ? ? ?map_idx_10_to_20(iid_mapped[e], ps->iid_par[e], 1);
> > > > + ? ? ? ? ? ?else
> > > > + ? ? ? ? ? ? ? ?iid_mapped = ps->iid_par;
> > > > + ? ? ? ? ? ?if (ps->enable_ipdopd) {
> > > > + ? ? ? ? ? ? ? ?if (ps->nr_ipdopd_par == 17) {
> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_34_to_20(ipd_mapped[e], ps->ipd_par[e], 0);
> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_34_to_20(opd_mapped[e], ps->opd_par[e], 0);
> > > > + ? ? ? ? ? ? ? ?} else if (ps->nr_ipdopd_par == 5) {
> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_10_to_20(ipd_mapped[e], ps->ipd_par[e], 0);
> > > > + ? ? ? ? ? ? ? ? ? ?map_idx_10_to_20(opd_mapped[e], ps->opd_par[e], 0);
> > > > + ? ? ? ? ? ? ? ?} else {
> > > > + ? ? ? ? ? ? ? ? ? ?ipd_mapped = ps->ipd_par;
> > > > + ? ? ? ? ? ? ? ? ? ?opd_mapped = ps->opd_par;
> > > > + ? ? ? ? ? ? ? ?}
> > > > + ? ? ? ? ? ?}
> > > > + ? ? ? ?}
> > > > + ? ? ? ?if (ps->is34bands_old) {
> > > > + ? ? ? ? ? ?map_val_34_to_20(H11[0], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H11[1], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H12[0], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H12[1], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H21[0], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H21[1], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H22[0], 0);
> > > > + ? ? ? ? ? ?map_val_34_to_20(H22[1], 0);
> > > > + ? ? ? ? ? ?ipdopd_reset(ps->ipd_smooth, ps->opd_smooth);
> > > > + ? ? ? ?}
> > > > + ? ?}
> > > > +
> > >
> > > > + ? ?//Mixing
> > > > + ? ?for (e = 0; e < ps->num_env; e++) {
> > > > + ? ? ? ?for (b = 0; b < NR_PAR_BANDS[is34]; b++) {
> > > > + ? ? ? ? ? ?float h11, h12, h21, h22;
> > > > + ? ? ? ? ? ?h11 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][0];
> > > > + ? ? ? ? ? ?h12 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][1];
> > > > + ? ? ? ? ? ?h21 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][2];
> > > > + ? ? ? ? ? ?h22 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][3];
> > > > + ? ? ? ? ? ?if (!PS_BASELINE && ps->enable_ipdopd && b < ps->nr_ipdopd_par) {
> > > > + ? ? ? ? ? ? ? ?//The spec say says to only run this smoother when enable_ipdopd
> > > > + ? ? ? ? ? ? ? ?//is set but the reference decoder appears to run it constantly
> > > > + ? ? ? ? ? ? ? ?float h11i, h12i, h21i, h22i;
> > > > + ? ? ? ? ? ? ? ?float opd_mag, ipd_mag, ipd_adj_re, ipd_adj_im;
> > > > + ? ? ? ? ? ? ? ?float opd_re = ipdopd_cos[ps->opd_par[e][b]];
> > > > + ? ? ? ? ? ? ? ?float opd_im = ipdopd_sin[ps->opd_par[e][b]];
> > > > + ? ? ? ? ? ? ? ?float ipd_re = ipdopd_cos[ps->ipd_par[e][b]];
> > > > + ? ? ? ? ? ? ? ?float ipd_im = ipdopd_sin[ps->ipd_par[e][b]];
> > > > + ? ? ? ? ? ? ? ?float opd_im_smooth = 0.25f * opd_smooth[b][0][1] + 0.5f * opd_smooth[b][1][1] + opd_im;
> > > > + ? ? ? ? ? ? ? ?float opd_re_smooth = 0.25f * opd_smooth[b][0][0] + 0.5f * opd_smooth[b][1][0] + opd_re;
> > > > + ? ? ? ? ? ? ? ?float ipd_im_smooth = 0.25f * ipd_smooth[b][0][1] + 0.5f * ipd_smooth[b][1][1] + ipd_im;
> > > > + ? ? ? ? ? ? ? ?float ipd_re_smooth = 0.25f * ipd_smooth[b][0][0] + 0.5f * ipd_smooth[b][1][0] + ipd_re;
> > > > + ? ? ? ? ? ? ? ?opd_smooth[b][0][0] = opd_smooth[b][1][0];
> > > > + ? ? ? ? ? ? ? ?opd_smooth[b][0][1] = opd_smooth[b][1][1];
> > > > + ? ? ? ? ? ? ? ?opd_smooth[b][1][0] = opd_re;
> > > > + ? ? ? ? ? ? ? ?opd_smooth[b][1][1] = opd_im;
> > > > + ? ? ? ? ? ? ? ?ipd_smooth[b][0][0] = ipd_smooth[b][1][0];
> > > > + ? ? ? ? ? ? ? ?ipd_smooth[b][0][1] = ipd_smooth[b][1][1];
> > > > + ? ? ? ? ? ? ? ?ipd_smooth[b][1][0] = ipd_re;
> > > > + ? ? ? ? ? ? ? ?ipd_smooth[b][1][1] = ipd_im;
> > > > + ? ? ? ? ? ? ? ?opd_mag = 1 / sqrt(opd_im_smooth * opd_im_smooth + opd_re_smooth * opd_re_smooth);
> > > > + ? ? ? ? ? ? ? ?ipd_mag = 1 / sqrt(ipd_im_smooth * ipd_im_smooth + ipd_re_smooth * ipd_re_smooth);
> > > > + ? ? ? ? ? ? ? ?opd_re = opd_re_smooth * opd_mag;
> > > > + ? ? ? ? ? ? ? ?opd_im = opd_im_smooth * opd_mag;
> > > > + ? ? ? ? ? ? ? ?ipd_re = ipd_re_smooth * ipd_mag;
> > > > + ? ? ? ? ? ? ? ?ipd_im = ipd_im_smooth * ipd_mag;
> > >
> > > if i decyphered this nonsense correctly then
> > > ?pd_re/im values here can have 512 distinct values
> > > thus a LUT that is indexed by 3 ps->o/ipd_par[e][b] values could be used
> > >
> >
> > Quantized ipd/opd take the the vales of 0-7 so smoothed values
> > normalized to unit vectors would be 8x8x8x2 floats 1024 values total
> > (512 real and 512 imaginary). Do you think an LUT is worth it here? It
> > would eliminate the invsqrt.
>
> i would suspect the lut would is faster than the sqrt() but this of course
> depends on cpu cache sizes and how fast the cpu can caclulate a ^-0.5
>

ok

[...]