[FFmpeg-soc] [soc]: r2592 - in aacenc: aacenc.c aacpsy.c aacpsy.h
kostya
subversion at mplayerhq.hu
Mon Jun 30 08:42:09 CEST 2008
Author: kostya
Date: Mon Jun 30 08:42:09 2008
New Revision: 2592
Log:
'Real' psychoacoustic model based on 3GPP TS26.403.
It is not complete yet, but at least it does something useful
and encodes close to the specified bitrate.
Modified:
aacenc/aacenc.c
aacenc/aacpsy.c
aacenc/aacpsy.h
Modified: aacenc/aacenc.c
==============================================================================
--- aacenc/aacenc.c (original)
+++ aacenc/aacenc.c Mon Jun 30 08:42:09 2008
@@ -232,7 +232,7 @@ static int aac_encode_init(AVCodecContex
ff_kbd_window_init(s->kbd_long_1024, 4.0, 1024);
ff_kbd_window_init(s->kbd_short_128, 6.0, 128);
- ff_aac_psy_init(&s->psy, avctx, AAC_PSY_NULL, 0, s->swb_sizes1024, s->swb_num1024, s->swb_sizes128, s->swb_num128);
+ ff_aac_psy_init(&s->psy, avctx, AAC_PSY_3GPP, 0, s->swb_sizes1024, s->swb_num1024, s->swb_sizes128, s->swb_num128);
avctx->extradata = av_malloc(2);
avctx->extradata_size = 2;
put_audio_specific_config(avctx);
Modified: aacenc/aacpsy.c
==============================================================================
--- aacenc/aacpsy.c (original)
+++ aacenc/aacpsy.c Mon Jun 30 08:42:09 2008
@@ -232,6 +232,259 @@ static void psy_null8_process(AACPsyCont
}
}
+/**
+ * constants for 3GPP AAC psychoacoustic model
+ *
+ * C2 and C3 enter the perceptual entropy terms that psy_3gpp_process()
+ * computes for bands whose energy-to-threshold ratio is below 8.0.
+ * C1 is not referenced by the code in this file section; the ratio check
+ * there is written with the literal 8.0 (= 2^C1).
+ * @{
+ */
+#define PSY_3GPP_C1 3.0f // log2(8.0)
+#define PSY_3GPP_C2 1.32192809488736234787f // log2(2.5)
+#define PSY_3GPP_C3 0.55935730170421255071f // 1 - C2/C1
+/**
+ * @}
+ */
+
+/**
+ * information for single band used by 3GPP TS26.403-inspired psychoacoustic model
+ *
+ * All fields are recomputed by psy_3gpp_process() for every frame after the
+ * whole band table has been zeroed.
+ */
+typedef struct Psy3gppBand{
+ float energy; ///< band energy: sum of squared coefficients, scaled by 1048576
+ float ffac; ///< form factor: sum of sqrt(|coefficient|) over the band, scaled by 32
+ float thr; ///< energy threshold: starts as energy * 0.001258925, later adjusted by modify_thr()
+ float pe; ///< perceptual entropy (not written by the code visible here)
+ float a; ///< constant part in perceptual entropy
+ float b; ///< variable part in perceptual entropy (multiplies log2 of the threshold)
+ float nl; ///< predicted number of lines left after quantization: ffac / (energy / band width)^0.25
+}Psy3gppBand;
+
+/**
+ * 3GPP TS26.403-inspired psychoacoustic model specific data
+ *
+ * Allocated zero-filled by psy_3gpp_init() and released by psy_3gpp_end().
+ * Arrays with a leading dimension of 2 are indexed by channel first.
+ */
+typedef struct Psy3gppContext{
+ float barks [1024]; ///< Bark value of each spectral line, filled once in psy_3gpp_init()
+ Psy3gppBand band[2][128]; ///< per-channel, per-band analysis data
+ float sfb_pe[2][128]; ///< per-band perceptual entropy: a - b * log2(thr)
+ int reservoir; ///< running sum of (avg_bits - frame_bits) over processed frames
+ int avg_bits; ///< bit_rate * 1024 / sample_rate, i.e. average bits per 1024-sample frame
+ float a[2]; ///< per-channel sum of band 'a' values divided by 1024
+ float b[2]; ///< per-channel sum of band 'b' values divided by 1024
+ float thr[2]; ///< per-channel sum of band thresholds divided by 1024
+}Psy3gppContext;
+
+/**
+ * Map a frequency onto the Bark scale.
+ *
+ * The result is the sum of two arctangent terms: one linear in the frequency
+ * and one in the square of the frequency relative to 7500.
+ *
+ * @param f frequency (psy_3gpp_init() derives it from the sample rate, so
+ *          presumably Hz)
+ * @return Bark value for f
+ */
+static inline float calc_bark(float f)
+{
+    const float rel       = f / 7500.0f;
+    const float low_term  = 13.3f * atanf(0.00076f * f);
+    const float high_term = 3.5f * atanf(rel * rel);
+
+    return low_term + high_term;
+}
+
+/**
+ * Allocate and set up the private state of the 3GPP model.
+ *
+ * Fills the per-line Bark table and derives the average bit budget of one
+ * 1024-sample frame from the configured bitrate and sample rate.
+ *
+ * @param apc psychoacoustic context; model_priv_data is assigned here and
+ *            apc->avctx supplies sample_rate and bit_rate
+ * @return 0 on success, a negative value if the private data could not be
+ *         allocated
+ */
+static int psy_3gpp_init(AACPsyContext *apc)
+{
+    Psy3gppContext *pctx;
+    int i;
+
+    apc->model_priv_data = av_mallocz(sizeof(Psy3gppContext));
+    if(!apc->model_priv_data)
+        return -1;
+    pctx = (Psy3gppContext*) apc->model_priv_data;
+
+    // line i of a 1024-line spectrum sits at i * sample_rate / 2048
+    for(i = 0; i < 1024; i++)
+        pctx->barks[i] = calc_bark(i * apc->avctx->sample_rate / 2048.0);
+
+    // widen before multiplying: bit_rate * 1024 overflows int above ~2 Mbit/s
+    pctx->avg_bits = (int64_t)apc->avctx->bit_rate * 1024 / apc->avctx->sample_rate;
+    return 0;
+}
+
+/**
+ * Select the window configuration the encoder has to use for this element.
+ *
+ * Currently a stub: every channel is forced to a single long window using the
+ * 1024-sample band layout, because the encoder cannot yet switch from long to
+ * short windows.
+ *
+ * @param apc     psychoacoustic context (channel count and long-window bands)
+ * @param audio   unused
+ * @param channel unused
+ * @param cpe     channel element whose window information is filled in
+ * @see 3GPP TS26.403 5.4.1
+ */
+static void psy_3gpp_window(AACPsyContext *apc, int16_t *audio, int channel, cpe_struct *cpe)
+{
+    int c = 0;
+
+    //XXX: stub, because encoder does not support long to short window transition yet :(
+    while(c < apc->avctx->channels){
+        cpe->ch[c].ics.swb_sizes       = apc->bands1024;
+        cpe->ch[c].ics.num_swb         = apc->num_bands1024;
+        cpe->ch[c].ics.num_windows     = 1;
+        cpe->ch[c].ics.group_len[0]    = 0;
+        cpe->ch[c].ics.window_shape    = 1;
+        cpe->ch[c].ics.window_sequence = ONLY_LONG_SEQUENCE;
+        c++;
+    }
+
+    // both channels can share window data only when their shapes agree
+    cpe->common_window = cpe->ch[0].ics.window_shape == cpe->ch[1].ics.window_shape;
+}
+
+/**
+ * Shift a threshold by a constant offset applied in the loudness domain.
+ *
+ * The threshold is taken to the power 0.25, the offset is added, and the sum
+ * is raised back to the fourth power.
+ *
+ * @param thr threshold to modify
+ * @param r   offset added to thr^0.25
+ * @return (thr^0.25 + r)^4
+ * @see 3GPP TS26.403 5.6.1.1.1
+ */
+static inline float modify_thr(float thr, float r)
+{
+    const float loud = pow(thr, 0.25) + r;
+
+    return loud * loud * loud * loud;
+}
+
+/**
+ * Determine scalefactors and prepare coefficients for encoding.
+ *
+ * Processing steps, in order:
+ *  - stereo attenuation of the input coefficients (5.2),
+ *  - per-band energy, threshold, form factor and perceptual entropy terms (5.4.2),
+ *  - per-band M/S decision for stereo pairs sharing a window (5.5.2),
+ *  - threshold correction towards the available bit budget (5.6.1),
+ *  - scalefactor selection (5.6.2),
+ *  - M/S application, quantization through convert_coeffs() and max_sfb search.
+ *
+ * @param apc     psychoacoustic context (private model data, band layout, avctx)
+ * @param audio   unused by this model
+ * @param channel unused by this model
+ * @param cpe     channel element; coefficients, gain, scalefactor indices,
+ *                zero flags, M/S information and max_sfb are updated in place
+ * @see 3GPP TS26.403 5.4
+ */
+static void psy_3gpp_process(AACPsyContext *apc, int16_t *audio, int channel, cpe_struct *cpe)
+{
+    int start, sum, maxsfb;
+    int ch, g, i;
+    int prev_scale;
+    Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
+    float stereo_att, pe_target;
+    int bits_avail;
+    // M/S coding is only considered for a stereo pair sharing one window
+    const int use_ms = apc->avctx->channels > 1 && cpe->common_window;
+
+    //calculate and apply stereo attenuation factor - 5.2
+    if(apc->avctx->channels > 1){
+        float l, r;
+        stereo_att = 1.0 / 2.0; //XXX: find some way to determine it
+        for(i = 0; i < 1024; i++){
+            l = cpe->ch[0].coeffs[i];
+            r = cpe->ch[1].coeffs[i];
+            cpe->ch[0].coeffs[i] = (0.5 + stereo_att) * l + (0.5 - stereo_att) * r;
+            cpe->ch[1].coeffs[i] = (0.5 - stereo_att) * l + (0.5 + stereo_att) * r;
+        }
+    }
+
+    //calculate energies, initial thresholds and related values - 5.4.2
+    memset(pctx->band, 0, sizeof(pctx->band));
+    for(ch = 0; ch < apc->avctx->channels; ch++){
+        start = 0;
+        cpe->ch[ch].gain = 0;
+        // these sums describe the current frame only, so restart them here
+        // instead of letting them grow from frame to frame
+        pctx->a[ch]   = 0.0f;
+        pctx->b[ch]   = 0.0f;
+        pctx->thr[ch] = 0.0f;
+        for(g = 0; g < apc->num_bands1024; g++){
+            Psy3gppBand *band = &pctx->band[ch][g];
+
+            for(i = 0; i < apc->bands1024[g]; i++)
+                band->energy += cpe->ch[ch].coeffs[start+i] * cpe->ch[ch].coeffs[start+i];
+            band->energy *= 1048576.0;
+            band->thr = band->energy * 0.001258925f;
+            if(band->energy > band->thr){
+                float ffac = 0.0;
+
+                // form factor of THIS band: 'start' must still point at its first line
+                for(i = 0; i < apc->bands1024[g]; i++)
+                    ffac += sqrt(FFABS(cpe->ch[ch].coeffs[start+i]));
+                band->ffac = ffac * 32.0;
+
+                band->nl = band->ffac / pow(band->energy/apc->bands1024[g], 0.25);
+                if(band->energy / band->thr >= 8.0){
+                    band->a = band->nl * log2(band->energy);
+                    band->b = band->nl;
+                }else{
+                    band->a = band->nl * (PSY_3GPP_C2 + PSY_3GPP_C3 * log2(band->energy));
+                    band->b = band->nl * PSY_3GPP_C3;
+                }
+                pctx->sfb_pe[ch][g] = band->a - band->b * log2(band->thr);
+                cpe->ch[ch].zeroes[0][g] = 0;
+            }else{
+                // no energy above the threshold: the band is coded as zero
+                cpe->ch[ch].zeroes[0][g] = 1;
+            }
+            pctx->a[ch]   += band->a;
+            pctx->b[ch]   += band->b;
+            pctx->thr[ch] += band->thr;
+            // advance only after every per-line loop of this band is done
+            start += apc->bands1024[g];
+        }
+        pctx->a[ch]   /= 1024.0f;
+        pctx->b[ch]   /= 1024.0f;
+        pctx->thr[ch] /= 1024.0f;
+    }
+
+    //modify thresholds - spread, threshold in quiet - 5.4.3
+    //TODO
+
+    // M/S detection - 5.5.2
+    if(use_ms){
+        start = 0;
+        for(g = 0; g < cpe->ch[0].ics.num_swb; g++){
+            double en_m = 0.0, en_s = 0.0, l1;
+            float m, s;
+            const int width = cpe->ch[0].ics.swb_sizes[g];
+
+            cpe->ms.mask[0][g] = 0;
+            // silent bands keep L/R coding; they still have to be stepped over
+            if(pctx->band[0][g].energy + pctx->band[1][g].energy != 0.0){
+                for(i = 0; i < width; i++){
+                    m = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
+                    s = (cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i]) / 2.0;
+                    en_m += m*m;
+                    en_s += s*s;
+                }
+                l1 = FFMIN(pctx->band[0][g].thr, pctx->band[1][g].thr);
+                l1 = l1*l1 / (en_m + en_s);
+                if(l1 >= (pctx->band[0][g].thr * pctx->band[1][g].thr / (pctx->band[0][g].energy + pctx->band[1][g].energy)))
+                    cpe->ms.mask[0][g] = 1;
+            }
+            // move on to the first line of the next band
+            start += width;
+        }
+    }
+
+    //bitrate reduction - 5.6.1
+    //TODO: add more than first step estimation
+    pctx->reservoir += pctx->avg_bits - apc->avctx->frame_bits;
+    bits_avail = pctx->avg_bits + pctx->reservoir;
+    pe_target = 1.18f * bits_avail / apc->avctx->channels / 1024.0f;
+    for(ch = 0; ch < apc->avctx->channels; ch++){
+        float t0, pe, r;
+        // b == 0 means no band carries energy; nothing to correct (and avoids dividing by 0)
+        if(pctx->b[ch] == 0.0f) continue;
+        pe = pctx->a[ch] - pctx->b[ch] * 4.0f * log2(pow(pctx->thr[ch]/cpe->ch[ch].ics.num_swb, 0.25));
+        t0 = pow(2.0, (pctx->a[ch] - pe) / (4.0 * pctx->b[ch]));
+        r = pow(2.0, (pctx->a[ch] - pe_target) / (4.0 * pctx->b[ch])) - t0;
+
+        //add correction factor to thresholds
+        for(g = 0; g < apc->num_bands1024; g++)
+            pctx->band[ch][g].thr = modify_thr(pctx->band[ch][g].thr, r);
+    }
+
+    //determine scalefactors - 5.6.2
+    //TODO: quantization optimization, scalefactor difference reduction
+    for(ch = 0; ch < apc->avctx->channels; ch++){
+        prev_scale = -1;
+        cpe->ch[ch].gain = SCALE_ONE_POS;
+        for(g = 0; g < apc->num_bands1024; g++){
+            if(cpe->ch[ch].zeroes[0][g]) continue;
+            //spec gives constant for lg() but we scaled it for log2()
+            cpe->ch[ch].sf_idx[0][g] = (int)(2.66667 * (log2(6.75*pctx->band[ch][g].thr) - log2(pctx->band[ch][g].ffac)));
+            cpe->ch[ch].sf_idx[0][g] = av_clip(cpe->ch[ch].sf_idx[0][g], 0, 255);
+            // the first coded band sets the global gain, later ones are limited
+            // to SCALE_MAX_DIFF from their predecessor
+            if(prev_scale != -1)
+                cpe->ch[ch].sf_idx[0][g] = av_clip(cpe->ch[ch].sf_idx[0][g], prev_scale - SCALE_MAX_DIFF, prev_scale + SCALE_MAX_DIFF);
+            else
+                cpe->ch[ch].gain = cpe->ch[ch].sf_idx[0][g];
+            prev_scale = cpe->ch[ch].sf_idx[0][g];
+        }
+    }
+
+    for(ch = 0; ch < apc->avctx->channels; ch++){
+        start = 0;
+        cpe->ch[ch].pulse.present = 0;
+        for(g = 0; g < apc->num_bands1024; g++){
+            sum = 0;
+            //apply M/S
+            // done once, while handling channel 0, and only if the mask was
+            // actually computed for this frame
+            if(!ch && use_ms && cpe->ms.mask[0][g]){
+                for(i = 0; i < apc->bands1024[g]; i++){
+                    cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
+                    cpe->ch[1].coeffs[start+i] = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
+                }
+            }
+            if(cpe->ch[ch].sf_idx[0][g])
+                sum = convert_coeffs(cpe->ch[ch].coeffs + start, cpe->ch[ch].icoefs + start, apc->bands1024[g], cpe->ch[ch].sf_idx[0][g]);
+            cpe->ch[ch].zeroes[0][g] = !sum;
+            start += apc->bands1024[g];
+        }
+        // max_sfb is one past the last band that is not flagged as zero
+        for(maxsfb = apc->num_bands1024; maxsfb > 0 && cpe->ch[ch].zeroes[0][maxsfb-1]; maxsfb--);
+        cpe->ch[ch].ics.max_sfb = maxsfb;
+    }
+
+    if(use_ms){
+        int msc = 0;
+        cpe->ch[0].ics.max_sfb = FFMAX(cpe->ch[0].ics.max_sfb, cpe->ch[1].ics.max_sfb);
+        cpe->ch[1].ics.max_sfb = cpe->ch[0].ics.max_sfb;
+        for(i = 0; i < cpe->ch[0].ics.max_sfb; i++)
+            if(cpe->ms.mask[0][i]) msc++;
+        // 0: no band uses M/S, 1: some bands do, 2: all bands do
+        if(msc == 0 || cpe->ch[0].ics.max_sfb == 0) cpe->ms.present = 0;
+        else cpe->ms.present = msc < cpe->ch[0].ics.max_sfb ? 1 : 2;
+    }
+}
+
+/**
+ * Release the private model data that psy_3gpp_init() stored in the context.
+ *
+ * @param apc psychoacoustic context whose model_priv_data is freed
+ */
+static void psy_3gpp_end(AACPsyContext *apc)
+{
+ av_freep(&apc->model_priv_data);
+}
+
static const AACPsyModel psy_models[AAC_NB_PSY_MODELS] =
{
{
@@ -248,6 +501,13 @@ static const AACPsyModel psy_models[AAC_
psy_null8_process,
NULL,
},
+ {
+ "3GPP TS 26.403-inspired model",
+ psy_3gpp_init,
+ psy_3gpp_window,
+ psy_3gpp_process,
+ psy_3gpp_end,
+ },
};
int ff_aac_psy_init(AACPsyContext *ctx, AVCodecContext *avctx, int model, int flags,
Modified: aacenc/aacpsy.h
==============================================================================
--- aacenc/aacpsy.h (original)
+++ aacenc/aacpsy.h Mon Jun 30 08:42:09 2008
@@ -28,6 +28,7 @@
enum AACPsyModelType{
AAC_PSY_NULL, // do nothing on frequencies
AAC_PSY_NULL8, // do nothing on frequencies but work with short windows
+ AAC_PSY_3GPP, // model following recommendations from 3GPP TS 26.403
AAC_NB_PSY_MODELS
};
More information about the FFmpeg-soc
mailing list