[FFmpeg-devel] [PATCH] MPEG-4 Parametric Stereo decoder
Michael Niedermayer
michaelni
Tue Jun 1 00:42:08 CEST 2010
On Thu, May 20, 2010 at 01:51:41PM -0400, Alex Converse wrote:
> Yes there are some places where it can be further optimized but I'm
> losing motivation on this quickly so perhaps some review will respark
> my interest.
[...]
> +static int iid_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int e, int dt)
> +{
> + int b;
> + int table_idx = huff_iid[2*dt+ps->iid_quant];
> + VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> + if (dt) {
> + int e_prev = e ? e - 1 : ps->num_env_old - 1;
> + e_prev = FFMAX(e_prev, 0);
> + for (b = 0; b < ps->nr_iid_par; b++) {
> + ps->iid_par[e][b] = ps->iid_par[e_prev][b] +
> + get_vlc2(gb, vlc_table, 9, 3) -
> + huff_offset[table_idx];
> + if (FFABS(ps->iid_par[e][b]) > 7 + 8 * ps->iid_quant) {
> + av_log(avctx, AV_LOG_ERROR, "illegal iid\n");
> + return -1;
> + }
> + }
> + } else {
> + int prev = 0;
> + for (b = 0; b < ps->nr_iid_par; b++) {
> + prev += get_vlc2(gb, vlc_table, 9, 3) -
> + huff_offset[table_idx];
> + ps->iid_par[e][b] = prev;
> + if (FFABS(ps->iid_par[e][b]) > 7 + 8 * ps->iid_quant) {
> + av_log(avctx, AV_LOG_ERROR, "illegal iid\n");
> + return -1;
> + }
> + }
> + }
> + return 0;
> +}
> +
> +static int icc_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps, int e, int dt)
this should have a doxygen comment that explains what icc is and what
e and dt are
> +{
> + int b;
> + int table_idx = dt ? huff_icc_dt : huff_icc_df;
> + VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> + if (dt) {
> + int e_prev = e ? e - 1 : ps->num_env_old - 1;
> + e_prev = FFMAX(e_prev, 0);
> + for (b = 0; b < ps->nr_icc_par; b++) {
> + ps->icc_par[e][b] = ps->icc_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
> + if (ps->icc_par[e][b] > 7U) {
> + av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
> + return -1;
> + }
> + }
> + } else {
> + int prev = 0;
> + for (b = 0; b < ps->nr_icc_par; b++) {
> + prev += get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
> + ps->icc_par[e][b] = prev;
> + if (ps->icc_par[e][b] > 7U) {
> + av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
> + return -1;
> + }
> + }
> + }
this could be simplified to:
for (b = 0; b < ps->nr_icc_par; b++) {
ps->icc_par[e][b]= prev[b] + get_vlc2(gb, vlc_table, 9, 3) - huff_offset[table_idx];
if (ps->icc_par[e][b] > 7U) {
av_log(avctx, AV_LOG_ERROR, "illegal icc\n");
return -1;
}
}
and this possibly applies to other functions as well
> + return 0;
> +}
> +
> +static void ipd_data(GetBitContext *gb, PSContext *ps, int e, int dt)
> +{
> + int b;
> + int table_idx = dt ? huff_ipd_dt : huff_ipd_df;
> + VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> + if (dt) {
> + int e_prev = e ? e - 1 : ps->num_env_old - 1;
> + e_prev = FFMAX(e_prev, 0);
> + for (b = 0; b < ps->nr_ipdopd_par; b++) {
> + ps->ipd_par[e][b] = (ps->ipd_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 1)) & 0x07;
> + }
> + } else {
> + int prev = 0;
> + for (b = 0; b < ps->nr_ipdopd_par; b++) {
> + prev += get_vlc2(gb, vlc_table, 9, 3);
> + prev &= 0x07;
> + ps->ipd_par[e][b] = prev;
> + }
> + }
> +}
> +
> +static void opd_data(GetBitContext *gb, PSContext *ps, int e, int dt)
> +{
> + int b;
> + int table_idx = dt ? huff_opd_dt : huff_opd_df;
> + VLC_TYPE (*vlc_table)[2] = vlc_ps[table_idx].table;
> + if (dt) {
> + int e_prev = e ? e - 1 : ps->num_env_old - 1;
> + e_prev = FFMAX(e_prev, 0);
> + for (b = 0; b < ps->nr_ipdopd_par; b++) {
> + ps->opd_par[e][b] = (ps->opd_par[e_prev][b] + get_vlc2(gb, vlc_table, 9, 1)) & 0x07;
> + }
> + } else {
> + int prev = 0;
> + for (b = 0; b < ps->nr_ipdopd_par; b++) {
> + prev += get_vlc2(gb, vlc_table, 9, 3);
> + prev &= 0x07;
> + ps->opd_par[e][b] = prev;
> + }
> + }
> +}
all these functions look like they do pretty much the same, maybe this can be
done with just a single function?
[...]
> +int ff_ps_data(AVCodecContext *avctx, GetBitContext *gb, PSContext *ps)
> +{
> + int e;
> + int bit_count_start = get_bits_count(gb);
> + int header;
> +
> + header = get_bits1(gb);
> + if (header) { //enable_ps_header
> + ps->enable_iid = get_bits1(gb);
> + if (ps->enable_iid) {
> + ps->iid_mode = get_bits(gb, 3);
> + if (ps->iid_mode > 5) {
> + av_log(avctx, AV_LOG_ERROR, "iid_mode %d is reserved.\n",
> + ps->iid_mode);
> + return -1;
> + }
> + ps->nr_iid_par = nr_iidicc_par_tab[ps->iid_mode];
> + ps->iid_quant = ps->iid_mode > 2;
> + ps->nr_ipdopd_par = nr_iidopd_par_tab[ps->iid_mode];
> + }
> + ps->enable_icc = get_bits1(gb);
> + if (ps->enable_icc) {
> + ps->icc_mode = get_bits(gb, 3);
> + if (ps->icc_mode > 5) {
> + av_log(avctx, AV_LOG_ERROR, "icc_mode %d is reserved.\n",
> + ps->icc_mode);
> + return -1;
> + }
> + ps->nr_icc_par = nr_iidicc_par_tab[ps->icc_mode];
> + }
> + ps->enable_ext = get_bits1(gb);
> + }
> +
> + ps->frame_class = get_bits1(gb);
> + ps->num_env_old = ps->num_env;
> + ps->num_env = num_env_tab[ps->frame_class][get_bits(gb, 2)];
> +
> + ps->border_position[0] = -1;
> + if (ps->frame_class) {
> + for (e = 1; e <= ps->num_env; e++)
> + ps->border_position[e] = get_bits(gb, 5);
> + } else
> + for (e = 1; e <= ps->num_env; e++)
> + ps->border_position[e] = e * numQMFSlots / ps->num_env - 1;
> +
> + if (ps->enable_iid)
> + for (e = 0; e < ps->num_env; e++) {
> + int dt = get_bits1(gb);
> + if (iid_data(avctx, gb, ps, e, dt))
> + return -1;
> + }
> + else
i think that would benefit from a {}
> + memset(ps->iid_par, 0, sizeof(ps->iid_par));
> +
> + if (ps->enable_icc)
> + for (e = 0; e < ps->num_env; e++) {
> + int dt = get_bits1(gb);
> + if (icc_data(avctx, gb, ps, e, dt))
> + return -1;
> + }
> + else
> + memset(ps->icc_par, 0, sizeof(ps->icc_par));
> +
> + if (ps->enable_ext) {
> + int cnt = get_bits(gb, 4);
> + if (cnt == 15) {
> + cnt += get_bits(gb, 8);
> + }
> + cnt *= 8;
> + while (cnt > 7) {
> + int ps_extension_id = get_bits(gb, 2);
> + cnt -= 2 + ps_extension(gb, ps, ps_extension_id);
> + }
> + if (cnt < 0) {
> + av_log(avctx, AV_LOG_ERROR, "ps extension overflow %d", cnt);
> + return -1;
> + }
> + skip_bits(gb, cnt);
> + }
> +
> + ps->enable_ipdopd &= !PS_BASELINE;
> +
> + //Fix up envelopes
> + if (!ps->num_env) {
> + ps->num_env = 1;
> + ps->border_position[1] = 31;
> + if (ps->enable_iid && ps->num_env_old > 1) {
> + memcpy(ps->iid_par, ps->iid_par+ps->num_env_old-1, sizeof(ps->iid_par[0]));
> + }
> + if (ps->enable_icc && ps->num_env_old > 1) {
> + memcpy(ps->icc_par, ps->icc_par+ps->num_env_old-1, sizeof(ps->icc_par[0]));
> + }
> + if (ps->enable_ipdopd && ps->num_env_old > 1) {
> + memcpy(ps->ipd_par, ps->ipd_par+ps->num_env_old-1, sizeof(ps->ipd_par[0]));
> + memcpy(ps->opd_par, ps->opd_par+ps->num_env_old-1, sizeof(ps->opd_par[0]));
> + }
> + } else if (ps->border_position[ps->num_env] < numQMFSlots - 1) {
> + //Create a fake envelope
> + if (ps->enable_iid && ps->num_env_old > 1) {
> + memcpy(ps->iid_par+ps->num_env, ps->iid_par+ps->num_env-1, sizeof(ps->iid_par[0]));
> + }
> + if (ps->enable_icc && ps->num_env_old > 1) {
> + memcpy(ps->icc_par+ps->num_env, ps->icc_par+ps->num_env-1, sizeof(ps->icc_par[0]));
> + }
> + if (ps->enable_ipdopd) {
this if() differs from the others, intended or bug?
> + memcpy(ps->ipd_par+ps->num_env, ps->ipd_par+ps->num_env-1, sizeof(ps->ipd_par[0]));
> + memcpy(ps->opd_par+ps->num_env, ps->opd_par+ps->num_env-1, sizeof(ps->opd_par[0]));
> + }
> + ps->num_env++;
> + ps->border_position[ps->num_env] = numQMFSlots - 1;
> + }
also it appears this could maybe be factorized toward something like:
if (ps->enable_iid && ps->num_env_old > 1) {
memcpy(ps->iid_par+ps->num_env, ps->iid_par+X, sizeof(ps->iid_par[0]));
}
if (ps->enable_icc && ps->num_env_old > 1) {
memcpy(ps->icc_par+ps->num_env, ps->icc_par+X, sizeof(ps->icc_par[0]));
}
if (ps->enable_ipdopd && ps->num_env_old > 1) {
memcpy(ps->ipd_par+ps->num_env, ps->ipd_par+X, sizeof(ps->ipd_par[0]));
memcpy(ps->opd_par+ps->num_env, ps->opd_par+X, sizeof(ps->opd_par[0]));
}
ps->num_env++;
[...]
> +
> +/** Split one subband into 2 subsubbands with a real filter */
> +static av_noinline void hybrid2_re(float (*in)[2], float (*out)[32][2], const float filter[7], int len, int reverse)
> +{
> + int i, j;
> + for (i = 0; i < len; i++) {
> + float re_in = filter[6] * in[6+i][0]; //real inphase
0.5 * in[6+i][0]
> + float re_op = 0.0f; //real out of phase
> + float im_in = filter[6] * in[6+i][1]; //imag inphase
> + float im_op = 0.0f; //imag out of phase
> + for (j = 0; j < 6; j += 2) {
> + re_in += filter[j ] * (in[i+j ][0] + in[12-j +i][0]);
> + im_in += filter[j ] * (in[i+j ][1] + in[12-j +i][1]);
these are always 0 * i think
> + re_op += filter[j+1] * (in[i+j+1][0] + in[12-j-1+i][0]);
> + im_op += filter[j+1] * (in[i+j+1][1] + in[12-j-1+i][1]);
> + }
> + out[ reverse][i][0] = re_in + re_op;
> + out[ reverse][i][1] = im_in + im_op;
> + out[1-reverse][i][0] = re_in - re_op;
> + out[1-reverse][i][1] = im_in - im_op;
> + }
> +}
> +
> +/** Split one subband into 6 subsubbands with a complex filter */
what kind of filter?
is this based on a standard filter of some kind (dct? mdct?)
> +static av_noinline void hybrid6_cx(float (*in)[2], float (*out)[32][2], const float (*filter)[7][2], int len)
> +{
> + int i, j, ssb;
> + int N = 8;
> + float temp[8][2];
> +
> + for (i = 0; i < len; i++) {
> + for (ssb = 0; ssb < N; ssb++) {
> + float sum_re = filter[ssb][6][0] * in[i+6][0], sum_im = filter[ssb][6][0] * in[i+6][1];
> + for (j = 0; j < 6; j++) {
> + float in0_re = in[i+j][0];
> + float in0_im = in[i+j][1];
> + float in1_re = in[i+12-j][0];
> + float in1_im = in[i+12-j][1];
> + sum_re += filter[ssb][j][0] * (in0_re + in1_re) - filter[ssb][j][1] * (in0_im - in1_im);
> + sum_im += filter[ssb][j][0] * (in0_im + in1_im) + filter[ssb][j][1] * (in0_re - in1_re);
the filter values with even j look optimizeable (0.0 or 2 equal values or such)
similar things are true for the remaining filter
> + }
> + temp[ssb][0] = sum_re;
> + temp[ssb][1] = sum_im;
> + }
> + out[0][i][0] = temp[6][0];
> + out[0][i][1] = temp[6][1];
> + out[1][i][0] = temp[7][0];
> + out[1][i][1] = temp[7][1];
> + out[2][i][0] = temp[0][0];
> + out[2][i][1] = temp[0][1];
> + out[3][i][0] = temp[1][0];
> + out[3][i][1] = temp[1][1];
> + out[4][i][0] = temp[2][0] + temp[5][0];
> + out[4][i][1] = temp[2][1] + temp[5][1];
> + out[5][i][0] = temp[3][0] + temp[4][0];
> + out[5][i][1] = temp[3][1] + temp[4][1];
> + }
> +}
> +
> +static av_noinline void hybrid4_8_12_cx(float (*in)[2], float (*out)[32][2], const float (*filter)[7][2], int N, int len)
> +{
> + int i, j, ssb;
> +
> + for (i = 0; i < len; i++) {
> + for (ssb = 0; ssb < N; ssb++) {
> + float sum_re = filter[ssb][6][0] * in[i+6][0], sum_im = filter[ssb][6][0] * in[i+6][1];
> + for (j = 0; j < 6; j++) {
> + float in0_re = in[i+j][0];
> + float in0_im = in[i+j][1];
> + float in1_re = in[i+12-j][0];
> + float in1_im = in[i+12-j][1];
> + sum_re += filter[ssb][j][0] * (in0_re + in1_re) - filter[ssb][j][1] * (in0_im - in1_im);
> + sum_im += filter[ssb][j][0] * (in0_im + in1_im) + filter[ssb][j][1] * (in0_re - in1_re);
> + }
> + out[ssb][i][0] = sum_re;
> + out[ssb][i][1] = sum_im;
> + }
> + }
> +}
> +
> +static av_noinline void hybrid_analysis(float out[91][32][2], float in[64][44][2], int is34, int len)
> +{
> + int i;
> + if(is34) {
> + hybrid4_8_12_cx(in[0], out, f34_0_12, 12, len);
> + hybrid4_8_12_cx(in[1], out+12, f34_1_8, 8, len);
> + hybrid4_8_12_cx(in[2], out+20, f34_2_4, 4, len);
> + hybrid4_8_12_cx(in[3], out+24, f34_2_4, 4, len);
> + hybrid4_8_12_cx(in[4], out+28, f34_2_4, 4, len);
> + for (i = 0; i < 59; i++) {
> + memcpy(out[32 + i], in[5 + i]+6, len * sizeof(in[0][0]));
> + }
> + } else {
> + hybrid6_cx(in[0], out, f20_0_8, len);
> + hybrid2_re(in[1], out+6, g1_Q2, len, 1);
> + hybrid2_re(in[2], out+8, g1_Q2, len, 0);
> + for (i = 0; i < 61; i++) {
> + memcpy(out[10 + i], in[3 + i]+6, len * sizeof(in[0][0]));
> + }
is all that memcpy unavoidable?
[...]
> +#define MAP_GENERIC_34_TO_20(out, in, full) \
> + out[ 0] = (2*in[ 0] + in[ 1]) / 3; \
> + out[ 1] = ( in[ 1] + 2*in[ 2]) / 3; \
> + out[ 2] = (2*in[ 3] + in[ 4]) / 3; \
> + out[ 3] = ( in[ 4] + 2*in[ 5]) / 3; \
> + out[ 4] = ( in[ 6] + in[ 7]) / 2; \
for the idx (int8) case the /2 and /3 could be done by LUT
for float, multiply by inverse seems better
> + out[ 5] = ( in[ 8] + in[ 9]) / 2; \
> + out[ 6] = in[10]; \
> + out[ 7] = in[11]; \
> + out[ 8] = ( in[12] + in[13]) / 2; \
> + out[ 9] = ( in[14] + in[15]) / 2; \
> + out[10] = in[16]; \
> + if (full) { \
> + out[11] = in[17]; \
> + out[12] = in[18]; \
> + out[13] = in[19]; \
> + out[14] = ( in[20] + in[21]) / 2; \
> + out[15] = ( in[22] + in[23]) / 2; \
> + out[16] = ( in[24] + in[25]) / 2; \
> + out[17] = ( in[26] + in[27]) / 2; \
> + out[18] = ( in[28] + in[29] + in[30] + in[31]) / 4; \
> + out[19] = ( in[32] + in[33]) / 2; \
> + }
the indentation looks a bit confusing
[...]
> +static av_noinline void decorrelation(PSContext *ps, float (*out)[32][2], const float (*s)[32][2], int is34)
> +{
> + float power[34][PS_QMF_TIME_SLOTS];
> + float transient_gain[34][PS_QMF_TIME_SLOTS];
> + float *peak_decay_nrg = ps->peak_decay_nrg;
> + float *power_smooth = ps->power_smooth;
> + float *peak_decay_diff_smooth = ps->peak_decay_diff_smooth;
> + float (*delay)[PS_QMF_TIME_SLOTS + PS_MAX_DELAY][2] = ps->delay;
> + float (*ap_delay)[PS_AP_LINKS][PS_QMF_TIME_SLOTS + PS_MAX_AP_DELAY][2] = ps->ap_delay;
> + const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
> + const float peak_decay_factor = 0.76592833836465f;
> + const float transient_impact = 1.5f;
> + const float a_smooth = 0.25f; //< Smoothing coefficient
> + int i, k, m, n;
> + int n0 = 0, nL = 32;
> + static const int link_delay[] = { 3, 4, 5 };
> + static const float a[] = { 0.65143905753106f,
> + 0.56471812200776f,
> + 0.48954165955695f };
> +
> + if (is34 != ps->is34bands_old) {
> + memset(ps->peak_decay_nrg, 0, sizeof(ps->peak_decay_nrg));
> + memset(ps->power_smooth, 0, sizeof(ps->power_smooth));
> + memset(ps->peak_decay_diff_smooth, 0, sizeof(ps->peak_decay_diff_smooth));
> + memset(ps->delay, 0, sizeof(ps->delay));
> + memset(ps->ap_delay, 0, sizeof(ps->ap_delay));
> + }
> +
> + memset(power, 0, sizeof(power));
> + for (n = n0; n < nL; n++) {
> + for (k = 0; k < NR_BANDS[is34]; k++) {
> + int i = k_to_i[k];
> + power[i][n] += s[k][n][0] * s[k][n][0] + s[k][n][1] * s[k][n][1];
> + }
> + }
> +
> + //Transient detection
> + for (i = 0; i < NR_PAR_BANDS[is34]; i++) {
> + for (n = n0; n < nL; n++) {
> + float decayed_peak = peak_decay_factor * peak_decay_nrg[i];
> + peak_decay_nrg[i] = (decayed_peak < power[i][n]) ? power[i][n] : decayed_peak;
FFMAX
> + power_smooth[i] = a_smooth * power[i][n] + (1.0f - a_smooth) * power_smooth[i];
power_smooth[i] += a_smooth * (power[i][n] - power_smooth[i]);
> + peak_decay_diff_smooth[i] = a_smooth * (peak_decay_nrg[i] - power[i][n]) +
> + (1.0f - a_smooth) * peak_decay_diff_smooth[i];
something similar
> + transient_gain[i][n] = (transient_impact * peak_decay_diff_smooth[i] > power_smooth[i]) ?
> + power_smooth[i] / (transient_impact * peak_decay_diff_smooth[i]) : 1.0f;
maybe clearer with a temporary variable
X = transient_impact * peak_decay_diff_smooth[i]
transient_gain[i][n] = X > power_smooth[i] ? power_smooth[i] / X : 1.0f; // FFMIN(power_smooth[i] / X, 1);
> +//av_log(NULL, AV_LOG_ERROR, "transient_gain[%2d][%2d] %f %f %f\n", i, n, transient_gain[i][n], peak_decay_diff_smooth[i], power_smooth[i]);
> + }
> + }
> +
> + //Decorrelation and transient reduction
> + // PS_AP_LINKS - 1
> + // -----
> + // | | Q_fract_allpass[k][m]*z^-link_delay[m] - a[m]*g_decay_slope[k]
> + //H[k][z] = z^-2 * phi_fract[k] * | | ----------------------------------------------------------------
> + // | | 1 - a[m]*g_decay_slope[k]*Q_fract_allpass[k][m]*z^-link_delay[m]
> + // m = 0
> + //d[k][z] (out) = transient_gain_mapped[k][z] * H[k][z] * s[k][z]
> + for (k = 0; k < NR_ALLPASS_BANDS[is34]; k++) {
> + int b = k_to_i[k];
> + float g_decay_slope = 1.f - DECAY_SLOPE * (k - DECAY_CUTOFF[is34]);
> + g_decay_slope = FFMIN(g_decay_slope, 1.f);
> + g_decay_slope = FFMAX(g_decay_slope, 0.f);
av_clipf
> + memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
> + memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
> + for (m = 0; m < PS_AP_LINKS; m++) {
> + memcpy(ap_delay[k][m], ap_delay[k][m]+numQMFSlots, 5*sizeof(ap_delay[k][m][0]));
> + }
> + for (n = n0; n < nL; n++) {
> + float in_re = delay[k][n+PS_MAX_DELAY-2][0] * phi_fract[is34][k][0] -
> + delay[k][n+PS_MAX_DELAY-2][1] * phi_fract[is34][k][1];
> + float in_im = delay[k][n+PS_MAX_DELAY-2][0] * phi_fract[is34][k][1] +
> + delay[k][n+PS_MAX_DELAY-2][1] * phi_fract[is34][k][0];
> + for (m = 0; m < PS_AP_LINKS; m++) {
> + float ag = a[m] * g_decay_slope;
it appears that this ag[3] can be calculated outside of the loops
> + float a_re = ag * in_re;
> + float a_im = ag * in_im;
> + float link_delay_re = ap_delay[k][m][n+5-link_delay[m]][0];
> + float link_delay_im = ap_delay[k][m][n+5-link_delay[m]][1];
> + float fractional_delay_re = Q_fract_allpass[is34][k][m][0];
> + float fractional_delay_im = Q_fract_allpass[is34][k][m][1];
> + ap_delay[k][m][n+5][0] = in_re;
> + ap_delay[k][m][n+5][1] = in_im;
> + in_re = link_delay_re * fractional_delay_re - link_delay_im * fractional_delay_im - a_re;
> + in_im = link_delay_re * fractional_delay_im + link_delay_im * fractional_delay_re - a_im;
> + ap_delay[k][m][n+5][0] += ag * in_re;
> + ap_delay[k][m][n+5][1] += ag * in_im;
> + }
> + out[k][n][0] = transient_gain[b][n] * in_re;
> + out[k][n][1] = transient_gain[b][n] * in_im;
> + }
> + }
> + for (; k < SHORT_DELAY_BAND[is34]; k++) {
> + memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
> + memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
> + for (n = n0; n < nL; n++) {
> + //H = delay 14
> + out[k][n][0] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-14][0];
> + out[k][n][1] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-14][1];
> + }
can the compiler factor transient_gain[k_to_i[k]] and delay[k] out
or does it keep these dereferences in the loop?
> + }
> + for (; k < NR_BANDS[is34]; k++) {
> + memcpy(delay[k], delay[k]+nL, PS_MAX_DELAY*sizeof(delay[k][0]));
> + memcpy(delay[k]+PS_MAX_DELAY, s[k], numQMFSlots*sizeof(delay[k][0]));
> + for (n = n0; n < nL; n++) {
> + //H = delay 1
> + out[k][n][0] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-1][0];
> + out[k][n][1] = transient_gain[k_to_i[k]][n] * delay[k][n+PS_MAX_DELAY-1][1];
> + }
> + }
> +}
> +
> +static av_noinline void stereo_processing(PSContext *ps, float (*l)[32][2], float (*r)[32][2], int is34)
> +{
> + int e, b, k, n;
> +
> + float (*H11)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H11;
> + float (*H12)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H12;
> + float (*H21)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H21;
> + float (*H22)[PS_MAX_NUM_ENV+1][PS_MAX_NR_IIDICC] = ps->H22;
> + float (*opd_smooth)[2][2] = ps->opd_smooth;
> + float (*ipd_smooth)[2][2] = ps->ipd_smooth;
> + int8_t iid_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
> + int8_t icc_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IIDICC];
> + int8_t ipd_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IPDOPD];
> + int8_t opd_mapped_buf[PS_MAX_NUM_ENV][PS_MAX_NR_IPDOPD];
> + int8_t (*iid_mapped)[PS_MAX_NR_IIDICC] = iid_mapped_buf;
> + int8_t (*icc_mapped)[PS_MAX_NR_IIDICC] = icc_mapped_buf;
> + int8_t (*ipd_mapped)[PS_MAX_NR_IPDOPD] = ipd_mapped_buf;
> + int8_t (*opd_mapped)[PS_MAX_NR_IPDOPD] = opd_mapped_buf;
> + const int8_t *k_to_i = is34 ? k_to_i_34 : k_to_i_20;
> + const float (*H_LUT)[8][4] = (PS_BASELINE || ps->icc_mode < 3) ? HA : HB;
> + static const float ipdopd_sin[] = { 0, M_SQRT1_2, 1, M_SQRT1_2, 0, -M_SQRT1_2, -1, -M_SQRT1_2 };
> + static const float ipdopd_cos[] = { 1, M_SQRT1_2, 0, -M_SQRT1_2, -1, -M_SQRT1_2, 0, M_SQRT1_2 };
> +
> + //Remapping
> + for (b = 0; b < PS_MAX_NR_IIDICC; b++) {
> + H11[0][0][b] = H11[0][ps->num_env_old][b];
> + H12[0][0][b] = H12[0][ps->num_env_old][b];
> + H21[0][0][b] = H21[0][ps->num_env_old][b];
> + H22[0][0][b] = H22[0][ps->num_env_old][b];
> + H11[1][0][b] = H11[1][ps->num_env_old][b];
> + H12[1][0][b] = H12[1][ps->num_env_old][b];
> + H21[1][0][b] = H21[1][ps->num_env_old][b];
> + H22[1][0][b] = H22[1][ps->num_env_old][b];
> + }
> + if (is34) {
> + for (e = 0; e < ps->num_env; e++) {
> + if (ps->nr_icc_par == 20)
> + map_idx_20_to_34(icc_mapped[e], ps->icc_par[e], 1);
> + else if (ps->nr_icc_par == 10) {
> + map_idx_10_to_20(icc_mapped[e], ps->icc_par[e], 1);
> + map_idx_20_to_34(icc_mapped[e], icc_mapped[e], 1);
> + } else
> + icc_mapped = ps->icc_par;
> + if (ps->nr_iid_par == 20)
> + map_idx_20_to_34(iid_mapped[e], ps->iid_par[e], 1);
> + else if (ps->nr_iid_par == 10) {
> + map_idx_10_to_20(iid_mapped[e], ps->iid_par[e], 1);
> + map_idx_20_to_34(iid_mapped[e], iid_mapped[e], 1);
> + } else
> + iid_mapped = ps->iid_par;
duplicate
and maybe the map* functions should be marked as no_inline to avoid bloating
this up immensely
> + if (ps->enable_ipdopd) {
> + if (ps->nr_ipdopd_par == 11) {
> + map_idx_20_to_34(ipd_mapped[e], ps->ipd_par[e], 0);
> + map_idx_20_to_34(opd_mapped[e], ps->opd_par[e], 0);
> + } else if (ps->nr_ipdopd_par == 5) {
> + map_idx_10_to_20(ipd_mapped[e], ps->ipd_par[e], 0);
> + map_idx_20_to_34(ipd_mapped[e], ipd_mapped[e], 0);
these 2 could be combined and optimized
> + map_idx_10_to_20(opd_mapped[e], ps->opd_par[e], 0);
> + map_idx_20_to_34(opd_mapped[e], opd_mapped[e], 0);
> + } else {
> + ipd_mapped = ps->ipd_par;
> + opd_mapped = ps->opd_par;
> + }
> + }
> + }
> + if (!ps->is34bands_old) {
> + map_val_20_to_34(H11[0], 0);
> + map_val_20_to_34(H11[1], 0);
> + map_val_20_to_34(H12[0], 0);
> + map_val_20_to_34(H12[1], 0);
> + map_val_20_to_34(H21[0], 0);
> + map_val_20_to_34(H21[1], 0);
> + map_val_20_to_34(H22[0], 0);
> + map_val_20_to_34(H22[1], 0);
> + ipdopd_reset(ps->ipd_smooth, ps->opd_smooth);
> + }
> + } else {
> + for (e = 0; e < ps->num_env; e++) {
> + if (ps->nr_icc_par == 34)
> + map_idx_34_to_20(icc_mapped[e], ps->icc_par[e], 1);
> + else if (ps->nr_icc_par == 10)
> + map_idx_10_to_20(icc_mapped[e], ps->icc_par[e], 1);
> + else
> + icc_mapped = ps->icc_par;
> + if (ps->nr_iid_par == 34)
> + map_idx_34_to_20(iid_mapped[e], ps->iid_par[e], 1);
> + else if (ps->nr_iid_par == 10)
> + map_idx_10_to_20(iid_mapped[e], ps->iid_par[e], 1);
> + else
> + iid_mapped = ps->iid_par;
> + if (ps->enable_ipdopd) {
> + if (ps->nr_ipdopd_par == 17) {
> + map_idx_34_to_20(ipd_mapped[e], ps->ipd_par[e], 0);
> + map_idx_34_to_20(opd_mapped[e], ps->opd_par[e], 0);
> + } else if (ps->nr_ipdopd_par == 5) {
> + map_idx_10_to_20(ipd_mapped[e], ps->ipd_par[e], 0);
> + map_idx_10_to_20(opd_mapped[e], ps->opd_par[e], 0);
> + } else {
> + ipd_mapped = ps->ipd_par;
> + opd_mapped = ps->opd_par;
> + }
> + }
> + }
> + if (ps->is34bands_old) {
> + map_val_34_to_20(H11[0], 0);
> + map_val_34_to_20(H11[1], 0);
> + map_val_34_to_20(H12[0], 0);
> + map_val_34_to_20(H12[1], 0);
> + map_val_34_to_20(H21[0], 0);
> + map_val_34_to_20(H21[1], 0);
> + map_val_34_to_20(H22[0], 0);
> + map_val_34_to_20(H22[1], 0);
> + ipdopd_reset(ps->ipd_smooth, ps->opd_smooth);
> + }
> + }
> +
> + //Mixing
> + for (e = 0; e < ps->num_env; e++) {
> + for (b = 0; b < NR_PAR_BANDS[is34]; b++) {
> + float h11, h12, h21, h22;
> + h11 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][0];
> + h12 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][1];
> + h21 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][2];
> + h22 = H_LUT[iid_mapped[e][b] + 7 + 23 * ps->iid_quant][icc_mapped[e][b]][3];
> + if (!PS_BASELINE && ps->enable_ipdopd && b < ps->nr_ipdopd_par) {
> + //The spec say says to only run this smoother when enable_ipdopd
> + //is set but the reference decoder appears to run it constantly
> + float h11i, h12i, h21i, h22i;
> + float opd_mag, ipd_mag, ipd_adj_re, ipd_adj_im;
> + float opd_re = ipdopd_cos[ps->opd_par[e][b]];
> + float opd_im = ipdopd_sin[ps->opd_par[e][b]];
> + float ipd_re = ipdopd_cos[ps->ipd_par[e][b]];
> + float ipd_im = ipdopd_sin[ps->ipd_par[e][b]];
> + float opd_im_smooth = 0.25f * opd_smooth[b][0][1] + 0.5f * opd_smooth[b][1][1] + opd_im;
> + float opd_re_smooth = 0.25f * opd_smooth[b][0][0] + 0.5f * opd_smooth[b][1][0] + opd_re;
> + float ipd_im_smooth = 0.25f * ipd_smooth[b][0][1] + 0.5f * ipd_smooth[b][1][1] + ipd_im;
> + float ipd_re_smooth = 0.25f * ipd_smooth[b][0][0] + 0.5f * ipd_smooth[b][1][0] + ipd_re;
> + opd_smooth[b][0][0] = opd_smooth[b][1][0];
> + opd_smooth[b][0][1] = opd_smooth[b][1][1];
> + opd_smooth[b][1][0] = opd_re;
> + opd_smooth[b][1][1] = opd_im;
> + ipd_smooth[b][0][0] = ipd_smooth[b][1][0];
> + ipd_smooth[b][0][1] = ipd_smooth[b][1][1];
> + ipd_smooth[b][1][0] = ipd_re;
> + ipd_smooth[b][1][1] = ipd_im;
> + opd_mag = 1 / sqrt(opd_im_smooth * opd_im_smooth + opd_re_smooth * opd_re_smooth);
> + ipd_mag = 1 / sqrt(ipd_im_smooth * ipd_im_smooth + ipd_re_smooth * ipd_re_smooth);
> + opd_re = opd_re_smooth * opd_mag;
> + opd_im = opd_im_smooth * opd_mag;
> + ipd_re = ipd_re_smooth * ipd_mag;
> + ipd_im = ipd_im_smooth * ipd_mag;
if i deciphered this nonsense correctly then
?pd_re/im values here can have 512 distinct values
thus a LUT that is indexed by 3 ps->o/ipd_par[e][b] values could be used
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20100601/1df003ef/attachment.pgp>
More information about the ffmpeg-devel
mailing list