[FFmpeg-soc] [soc]: r3702 - in aacenc: aac_enc.patch aacenc.c aacpsy.c aacpsy.h lowpass.c lowpass.h psymodel.c psymodel.h
kostya
subversion at mplayerhq.hu
Tue Sep 2 08:14:15 CEST 2008
Author: kostya
Date: Tue Sep 2 08:14:14 2008
New Revision: 3702
Log:
Make encoder use generic psychoacoustic model interface and optimal quantizer search
Added:
aacenc/psymodel.c
aacenc/psymodel.h
Removed:
aacenc/aacpsy.h
aacenc/lowpass.c
aacenc/lowpass.h
Modified:
aacenc/aac_enc.patch
aacenc/aacenc.c
aacenc/aacpsy.c
Modified: aacenc/aac_enc.patch
==============================================================================
--- aacenc/aac_enc.patch (original)
+++ aacenc/aac_enc.patch Tue Sep 2 08:14:14 2008
@@ -6,7 +6,7 @@ index d4f6d1c..0ed9057 100644
OBJS-$(CONFIG_ENCODERS) += faandct.o jfdctfst.o jfdctint.o
-+OBJS-$(CONFIG_AAC_ENCODER) += aacenc.o aacpsy.o aactab.o lowpass.o mdct.o fft.o mpeg4audio.o
++OBJS-$(CONFIG_AAC_ENCODER) += aacenc.o aacpsy.o aactab.o psymodel.o iirfilter.o mdct.o fft.o mpeg4audio.o
OBJS-$(CONFIG_AAC_DECODER) += aac.o aactab.o mdct.o fft.o
OBJS-$(CONFIG_AASC_DECODER) += aasc.o
OBJS-$(CONFIG_AC3_DECODER) += ac3dec.o ac3tab.o ac3dec_data.o ac3.o mdct.o fft.o
Modified: aacenc/aacenc.c
==============================================================================
--- aacenc/aacenc.c (original)
+++ aacenc/aacenc.c Tue Sep 2 08:14:14 2008
@@ -26,7 +26,7 @@
/***********************************
* TODOs:
- * psy model selection with some option
+ * speedup quantizer selection
* add sane pulse detection
* add temporal noise shaping
***********************************/
@@ -36,10 +36,11 @@
#include "dsputil.h"
#include "mpeg4audio.h"
-#include "aacpsy.h"
#include "aac.h"
#include "aactab.h"
+#include "psymodel.h"
+
static const uint8_t swb_size_1024_96[] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8,
12, 12, 12, 12, 12, 16, 16, 24, 28, 36, 44,
@@ -192,7 +193,9 @@ typedef struct {
int samplerate_index; ///< MPEG-4 samplerate index
ChannelElement *cpe; ///< channel elements
- AACPsyContext psy; ///< psychoacoustic model context
+ FFPsyContext psy;
+ struct FFPsyPreprocessContext* psypp;
+ int cur_channel;
int last_frame;
} AACEncContext;
@@ -220,6 +223,8 @@ static av_cold int aac_encode_init(AVCod
{
AACEncContext *s = avctx->priv_data;
int i;
+ const uint8_t *sizes[2];
+ int lengths[2];
avctx->frame_size = 1024;
@@ -247,15 +252,22 @@ static av_cold int aac_encode_init(AVCod
s->samples = av_malloc(2 * 1024 * avctx->channels * sizeof(s->samples[0]));
s->cpe = av_mallocz(sizeof(ChannelElement) * aac_chan_configs[avctx->channels-1][0]);
- if(ff_aac_psy_init(&s->psy, avctx, AAC_PSY_3GPP,
- aac_chan_configs[avctx->channels-1][0], 0,
- swb_size_1024[i], ff_aac_num_swb_1024[i], swb_size_128[i], ff_aac_num_swb_128[i]) < 0){
- av_log(avctx, AV_LOG_ERROR, "Cannot initialize selected model.\n");
- return -1;
- }
avctx->extradata = av_malloc(2);
avctx->extradata_size = 2;
put_audio_specific_config(avctx);
+
+ sizes[0] = swb_size_1024[i];
+ sizes[1] = swb_size_128[i];
+ lengths[0] = ff_aac_num_swb_1024[i];
+ lengths[1] = ff_aac_num_swb_128[i];
+ ff_psy_init(&s->psy, avctx, 2, sizes, lengths);
+ s->psypp = ff_psy_preprocess_init(avctx);
+
+#ifndef CONFIG_HARDCODED_TABLES
+ for (i = 0; i < 316; i++)
+ ff_aac_pow2sf_tab[i] = pow(2, (i - 200)/4.);
+#endif /* CONFIG_HARDCODED_TABLES */
+
return 0;
}
@@ -351,6 +363,65 @@ static void encode_ms_info(PutBitContext
}
/**
+ * Quantize one coefficient.
+ * @return absolute value of the quantized coefficient
+ * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
+ */
+static av_always_inline int quant(float coef, const float Q)
+{
+ return av_clip((int)(pow(fabsf(coef) * Q, 0.75) + 0.4054), 0, 8191);
+}
+
+static inline float get_approximate_quant_error(const float *q, const int *c, int size, int scale_idx)
+{
+ int i;
+ float coef, unquant, sum = 0.0f;
+ const float IQ = ff_aac_pow2sf_tab[200 + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
+ for(i = 0; i < size; i++){
+ coef = fabsf(q[i]);
+ unquant = (c[i] * cbrt(c[i])) * IQ;
+ sum += (coef - unquant) * (coef - unquant);
+ }
+ return sum * 1.0;
+}
+
+/**
+ * Convert coefficients to integers.
+ * @fixme make it RD-optimal
+ * @return sum of coefficient absolute values
+ */
+static inline int quantize_band(const float *in, int *out, int size, int scale_idx)
+{
+ int i, sign, sum = 0;
+ const float Q = ff_aac_pow2sf_tab[200 - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+ for(i = 0; i < size; i++){
+ sign = in[i] > 0.0;
+ out[i] = quant(in[i], Q);
+ sum += out[i];
+ if(sign) out[i] = -out[i];
+ }
+ return sum;
+}
+
+static inline int get_approximate_bits(const int *in, int size)
+{
+ int i, bits = 0;
+ for(i = 0; i < size; i += 2){
+ int j, idx = 0;
+ for(j = 0; j < 2; j++){
+ int t = FFABS(in[i+j]);
+ if(t)
+ bits++;
+ if(t > 16)
+ bits += av_log2(t)*2 + 4 - 1;
+ idx = idx*17 + FFMIN(t, 16);
+ }
+ bits += ff_aac_spectral_bits[ESC_BT-1][idx];
+ }
+ return bits;
+}
+
+/**
* Calculate the number of bits needed to code all coefficient signs in current band.
*/
static int calculate_band_sign_bits(AACEncContext *s, SingleChannelElement *sce,
@@ -525,6 +596,178 @@ static void encode_window_bands_info(AAC
}
/**
+ * Produce integer coefficients from scalefactors provided by the model.
+ */
+static void quantize_coeffs(AACEncContext *apc, ChannelElement *cpe, int chans)
+{
+ int i, w, w2, g, ch;
+ int start, sum, maxsfb, cmaxsfb;
+
+ for(ch = 0; ch < chans; ch++){
+ IndividualChannelStream *ics = &cpe->ch[ch].ics;
+ start = 0;
+ maxsfb = 0;
+ cpe->ch[ch].pulse.num_pulse = 0;
+ for(w = 0; w < ics->num_windows*16; w += 16){
+ for(g = 0; g < ics->num_swb; g++){
+ sum = 0;
+ //apply M/S
+ if(!ch && cpe->ms_mask[w + g]){
+ for(i = 0; i < ics->swb_sizes[g]; i++){
+ cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
+ cpe->ch[1].coeffs[start+i] = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
+ }
+ }
+ if(!cpe->ch[ch].zeroes[w + g])
+ sum = quantize_band(cpe->ch[ch].coeffs + start,
+ cpe->ch[ch].icoefs + start,
+ ics->swb_sizes[g],
+ cpe->ch[ch].sf_idx[w + g]);
+ else
+ memset(cpe->ch[ch].icoefs + start, 0, ics->swb_sizes[g] * sizeof(cpe->ch[0].icoefs[0]));
+ cpe->ch[ch].zeroes[w + g] = !sum;
+ start += ics->swb_sizes[g];
+ }
+ for(cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w+cmaxsfb-1]; cmaxsfb--);
+ maxsfb = FFMAX(maxsfb, cmaxsfb);
+ }
+ ics->max_sfb = maxsfb;
+
+ //adjust zero bands for window groups
+ for(w = 0; w < ics->num_windows; w += ics->group_len[w]){
+ for(g = 0; g < ics->max_sfb; g++){
+ i = 1;
+ for(w2 = w; w2 < w + ics->group_len[w]; w2++){
+ if(!cpe->ch[ch].zeroes[w2*16 + g]){
+ i = 0;
+ break;
+ }
+ }
+ cpe->ch[ch].zeroes[w*16 + g] = i;
+ }
+ }
+ }
+
+ if(chans > 1 && cpe->common_window){
+ IndividualChannelStream *ics0 = &cpe->ch[0].ics;
+ IndividualChannelStream *ics1 = &cpe->ch[1].ics;
+ int msc = 0;
+ ics0->max_sfb = FFMAX(ics0->max_sfb, ics1->max_sfb);
+ ics1->max_sfb = ics0->max_sfb;
+ for(w = 0; w < ics0->num_windows*16; w += 16)
+ for(i = 0; i < ics0->max_sfb; i++)
+ if(cpe->ms_mask[w+i]) msc++;
+ if(msc == 0 || ics0->max_sfb == 0) cpe->ms_mode = 0;
+ else cpe->ms_mode = msc < ics0->max_sfb ? 1 : 2;
+ }
+}
+
+typedef struct TrellisPath {
+ float cost;
+ int prev;
+} TrellisPath;
+
+static void search_for_quantizers(AACEncContext *s, SingleChannelElement *sce)
+{
+ int q, w, g, start = 0;
+ int i;
+ int qcoeffs[128];
+ int idx;
+ TrellisPath paths[256*128];
+ int bandaddr[128];
+ const float lambda = 5e-7f;
+ int minq = 0;
+ float mincost;
+ int stack[128], sptr = 0;
+
+ for(i = 0; i < 256; i++){
+ paths[i].cost = 0.0f;
+ paths[i].prev = -1;
+ }
+ for(i = 256; i < 256*128; i++){
+ paths[i].cost = INFINITY;
+ paths[i].prev = -2;
+ }
+ idx = 256;
+ for(w = 0; w < sce->ics.num_windows*16; w += 16){
+ for(g = 0; g < sce->ics.num_swb; g++){
+ const float *coefs = sce->coeffs + start;
+ float qmin, qmax, invthr;
+ int minscale, maxscale;
+ FFPsyBand *band = &s->psy.psy_bands[s->cur_channel*PSY_MAX_BANDS+w+g];
+
+ bandaddr[idx >> 8] = w+g;
+ if(band->energy <= band->threshold){
+ sce->zeroes[w+g] = 1;
+ for(q = 0; q < 256; q++){
+ for(i = FFMAX(q - SCALE_MAX_DIFF, 0); i < FFMIN(q + SCALE_MAX_DIFF, 256); i++){
+ float cost;
+ if(isinf(paths[idx - 256 + i].cost))
+ continue;
+ cost = paths[idx - 256 + i].cost + ff_aac_scalefactor_bits[q - i + SCALE_DIFF_ZERO];
+ if(cost < paths[idx + q].cost){
+ paths[idx + q].cost = cost;
+ paths[idx + q].prev = idx - 256 + i;
+ }
+ }
+ }
+ start += sce->ics.swb_sizes[g];
+ idx += 256;
+ continue;
+ }
+ sce->zeroes[w+g] = 0;
+ qmin = qmax = fabsf(coefs[0]);
+ if(qmin == 0.0f) qmin = INT_MAX;
+ for(i = 1; i < sce->ics.swb_sizes[g]; i++){
+ float t = fabsf(coefs[i]);
+ if(t > 0.0f) qmin = fminf(qmin, t);
+ qmax = fmaxf(qmax, t);
+ }
+            //minimum scalefactor index is when minimum nonzero coefficient after quantizing is not clipped
+ minscale = av_clip_uint8(log2(qmin)*4 - 69 + SCALE_ONE_POS - SCALE_DIV_512);
+ //maximum scalefactor index is when maximum coefficient after quantizing is still not zero
+ maxscale = av_clip_uint8(log2(qmax)*4 + 6 + SCALE_ONE_POS - SCALE_DIV_512);
+ invthr = (band->threshold == 0.0f) ? INFINITY : 1.0 / band->threshold;
+ for(q = minscale; q < maxscale; q++){
+ float dist;
+ int bits, sum;
+ sum = quantize_band(coefs, qcoeffs, sce->ics.swb_sizes[g], q);
+ dist = get_approximate_quant_error(coefs, qcoeffs, sce->ics.swb_sizes[g], q);
+ bits = get_approximate_bits(qcoeffs, sce->ics.swb_sizes[g]);
+ for(i = FFMAX(q - SCALE_MAX_DIFF, 0); i < FFMIN(q + SCALE_MAX_DIFF, 256); i++){
+ float cost;
+ if(isinf(paths[idx - 256 + i].cost))
+ continue;
+ cost = paths[idx - 256 + i].cost + dist * invthr * lambda + bits
+ + ff_aac_scalefactor_bits[q - i + SCALE_DIFF_ZERO];
+ if(cost < paths[idx + q].cost){
+ paths[idx + q].cost = cost;
+ paths[idx + q].prev = idx - 256 + i;
+ }
+ }
+ }
+ start += sce->ics.swb_sizes[g];
+ idx += 256;
+ }
+ }
+ idx -= 256;
+ mincost = paths[idx].cost;
+ for(i = 1; i < 256; i++){
+ if(paths[idx + i].cost < mincost){
+ mincost = paths[idx + i].cost;
+ minq = idx + i;
+ }
+ }
+ while(minq >= 0){
+ stack[sptr++] = minq;
+ minq = paths[minq].prev;
+ }
+ for(i = sptr - 2; i >= 0; i--){
+ sce->sf_idx[bandaddr[stack[i]>>8]] = stack[i]&0xFF;
+ }
+}
+
+/**
* Encode the coefficients of one scalefactor band with selected codebook.
*/
static void encode_band_coeffs(AACEncContext *s, SingleChannelElement *sce,
@@ -600,9 +843,9 @@ static void encode_band_info(AACEncConte
/**
* Encode scalefactors.
*/
-static void encode_scale_factors(AVCodecContext *avctx, AACEncContext *s, SingleChannelElement *sce, int global_gain)
+static void encode_scale_factors(AVCodecContext *avctx, AACEncContext *s, SingleChannelElement *sce)
{
- int off = global_gain, diff;
+ int off = sce->sf_idx[0], diff;
int i, w;
for(w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]){
@@ -664,36 +907,10 @@ static void encode_spectral_coeffs(AACEn
*/
static int encode_individual_channel(AVCodecContext *avctx, AACEncContext *s, SingleChannelElement *sce, int common_window)
{
- int g, w;
- int global_gain, last = 256;
-
- //determine global gain as standard recommends - the first scalefactor value
- //and assign an appropriate scalefactor index to empty bands
- for(w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]){
- for(g = sce->ics.max_sfb - 1; g >= 0; g--){
- if(sce->sf_idx[w*16 + g] == 256)
- sce->sf_idx[w*16 + g] = last;
- else
- last = sce->sf_idx[w*16 + g];
- }
- }
- //make sure global gain won't be 256
- last &= 0xFF;
- global_gain = last;
- //assign scalefactor index to tail bands in case encoder decides to code them
- for(w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]){
- for(g = 0; g < sce->ics.max_sfb; g++){
- if(sce->sf_idx[w*16 + g] == 256)
- sce->sf_idx[w*16 + g] = last;
- else
- last = sce->sf_idx[w*16 + g];
- }
- }
-
- put_bits(&s->pb, 8, global_gain);
+ put_bits(&s->pb, 8, sce->sf_idx[0]);
if(!common_window) put_ics_info(s, &sce->ics);
encode_band_info(s, sce);
- encode_scale_factors(avctx, s, sce, global_gain);
+ encode_scale_factors(avctx, s, sce);
encode_pulses(s, &sce->pulse);
put_bits(&s->pb, 1, 0); //tns
put_bits(&s->pb, 1, 0); //ssr
@@ -734,7 +951,7 @@ static int aac_encode_frame(AVCodecConte
if(s->last_frame)
return 0;
if(data){
- if((s->psy.flags & PSY_MODEL_NO_PREPROC) == PSY_MODEL_NO_PREPROC){
+ if(!s->psypp){
memcpy(s->samples + 1024 * avctx->channels, data, 1024 * avctx->channels * sizeof(s->samples[0]));
}else{
start_ch = 0;
@@ -742,7 +959,7 @@ static int aac_encode_frame(AVCodecConte
for(i = 0; i < chan_map[0]; i++){
tag = chan_map[i+1];
chans = tag == TYPE_CPE ? 2 : 1;
- ff_aac_psy_preprocess(&s->psy, (uint16_t*)data + start_ch, samples2 + start_ch, i, tag);
+ ff_psy_preprocess(s->psypp, (uint16_t*)data + start_ch, samples2 + start_ch, start_ch + i, chans);
start_ch += chans;
}
}
@@ -759,17 +976,44 @@ static int aac_encode_frame(AVCodecConte
start_ch = 0;
memset(chan_el_counter, 0, sizeof(chan_el_counter));
for(i = 0; i < chan_map[0]; i++){
+ FFPsyWindowInfo wi[2];
tag = chan_map[i+1];
chans = tag == TYPE_CPE ? 2 : 1;
cpe = &s->cpe[i];
samples2 = samples + start_ch;
la = samples2 + 1024 * avctx->channels + start_ch;
if(!data) la = NULL;
- ff_aac_psy_suggest_window(&s->psy, samples2, la, i, tag, cpe);
for(j = 0; j < chans; j++){
+ IndividualChannelStream *ics = &cpe->ch[j].ics;
+ int k;
+ wi[j] = ff_psy_suggest_window(&s->psy, samples2, la, start_ch + j, ics->window_sequence[0]);
+ ics->window_sequence[1] = ics->window_sequence[0];
+ ics->window_sequence[0] = wi[j].window_type[0];
+ ics->use_kb_window[1] = ics->use_kb_window[0];
+ ics->use_kb_window[0] = wi[j].window_shape;
+ ics->num_windows = wi[j].num_windows;
+ ics->swb_sizes = s->psy.bands [ics->num_windows == 8];
+ ics->num_swb = s->psy.num_bands[ics->num_windows == 8];
+ for(k = 0; k < ics->num_windows; k++)
+ ics->group_len[k] = wi[j].grouping[k];
+
apply_window_and_mdct(avctx, s, &cpe->ch[j], samples2, j);
+ search_for_quantizers(s, &cpe->ch[j]);
}
- ff_aac_psy_analyze(&s->psy, i, tag, cpe);
+ cpe->common_window = 0;
+ if(chans > 1
+ && wi[0].window_type[0] == wi[1].window_type[0]
+ && wi[0].window_shape == wi[1].window_shape){
+
+ cpe->common_window = 1;
+ for(j = 0; j < wi[0].num_windows; j++){
+ if(wi[0].grouping[j] != wi[1].grouping[j]){
+ cpe->common_window = 0;
+ break;
+ }
+ }
+ }
+ quantize_coeffs(s, cpe, chans);
put_bits(&s->pb, 3, tag);
put_bits(&s->pb, 4, chan_el_counter[tag]++);
if(chans == 2){
@@ -780,6 +1024,8 @@ static int aac_encode_frame(AVCodecConte
}
}
for(j = 0; j < chans; j++){
+ s->cur_channel = start_ch + j;
+ ff_psy_set_band_info(&s->psy, s->cur_channel, cpe->ch[j].coeffs, &wi[j]);
encode_individual_channel(avctx, s, &cpe->ch[j], cpe->common_window);
}
start_ch += chans;
@@ -801,7 +1047,8 @@ static av_cold int aac_encode_end(AVCode
ff_mdct_end(&s->mdct1024);
ff_mdct_end(&s->mdct128);
- ff_aac_psy_end(&s->psy);
+ ff_psy_end(&s->psy);
+ ff_psy_preprocess_end(s->psypp);
av_freep(&s->samples);
av_freep(&s->cpe);
return 0;
Modified: aacenc/aacpsy.c
==============================================================================
--- aacenc/aacpsy.c (original)
+++ aacenc/aacpsy.c Tue Sep 2 08:14:14 2008
@@ -25,140 +25,20 @@
*/
#include "avcodec.h"
-#include "aacpsy.h"
#include "aactab.h"
+#include "psymodel.h"
/***********************************
* TODOs:
- * General:
- * better audio preprocessing (add DC highpass filter?)
- * more psy models
- * maybe improve coefficient quantization function in some way
- *
- * 3GPP-based psy model:
* thresholds linearization after their modifications for attaining given bitrate
* try other bitrate controlling mechanism (maybe use ratecontrol.c?)
* control quality for quality-based output
**********************************/
/**
- * Quantize one coefficient.
- * @return absolute value of the quantized coefficient
- * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
- */
-static av_always_inline int quant(float coef, const float Q)
-{
- return av_clip((int)(pow(fabsf(coef) * Q, 0.75) + 0.4054), 0, 8191);
-}
-
-/**
- * Convert coefficients to integers.
- * @return sum of coefficient absolute values
- */
-static inline int quantize_coeffs(float *in, int *out, int size, int scale_idx)
-{
- int i, sign, sum = 0;
- const float Q = ff_aac_pow2sf_tab[200 - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
- for(i = 0; i < size; i++){
- sign = in[i] > 0.0;
- out[i] = quant(in[i], Q);
- sum += out[i];
- if(sign) out[i] = -out[i];
- }
- return sum;
-}
-
-static inline float get_approximate_quant_error(float *c, int size, int scale_idx)
-{
- int i;
- int q;
- float coef, unquant, sum = 0.0f;
- const float Q = ff_aac_pow2sf_tab[200 - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
- const float IQ = ff_aac_pow2sf_tab[200 + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
- for(i = 0; i < size; i++){
- coef = fabs(c[i]);
- q = quant(c[i], Q);
- unquant = (q * cbrt(q)) * IQ;
- sum += (coef - unquant) * (coef - unquant);
- }
- return sum;
-}
-
-/**
- * Produce integer coefficients from scalefactors provided by the model.
- */
-static void psy_create_output(AACPsyContext *apc, ChannelElement *cpe, int chans)
-{
- int i, w, w2, g, ch;
- int start, sum, maxsfb, cmaxsfb;
-
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- start = 0;
- maxsfb = 0;
- cpe->ch[ch].pulse.num_pulse = 0;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- sum = 0;
- //apply M/S
- if(!ch && cpe->ms_mask[w + g]){
- for(i = 0; i < ics->swb_sizes[g]; i++){
- cpe->ch[0].coeffs[start+i] = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
- cpe->ch[1].coeffs[start+i] = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
- }
- }
- if(!cpe->ch[ch].zeroes[w + g])
- sum = quantize_coeffs(cpe->ch[ch].coeffs + start,
- cpe->ch[ch].icoefs + start,
- ics->swb_sizes[g],
- cpe->ch[ch].sf_idx[w + g]);
- else
- memset(cpe->ch[ch].icoefs + start, 0, ics->swb_sizes[g] * sizeof(cpe->ch[0].icoefs[0]));
- cpe->ch[ch].zeroes[w + g] = !sum;
- start += ics->swb_sizes[g];
- }
- for(cmaxsfb = ics->num_swb; cmaxsfb > 0 && cpe->ch[ch].zeroes[w+cmaxsfb-1]; cmaxsfb--);
- maxsfb = FFMAX(maxsfb, cmaxsfb);
- }
- ics->max_sfb = maxsfb;
-
- //adjust zero bands for window groups
- for(w = 0; w < ics->num_windows; w += ics->group_len[w]){
- for(g = 0; g < ics->max_sfb; g++){
- i = 1;
- for(w2 = w; w2 < w + ics->group_len[w]; w2++){
- if(!cpe->ch[ch].zeroes[w2*16 + g]){
- i = 0;
- break;
- }
- }
- cpe->ch[ch].zeroes[w*16 + g] = i;
- }
- }
- }
-
- if(chans > 1 && cpe->common_window){
- IndividualChannelStream *ics0 = &cpe->ch[0].ics;
- IndividualChannelStream *ics1 = &cpe->ch[1].ics;
- int msc = 0;
- ics0->max_sfb = FFMAX(ics0->max_sfb, ics1->max_sfb);
- ics1->max_sfb = ics0->max_sfb;
- for(w = 0; w < ics0->num_windows*16; w += 16)
- for(i = 0; i < ics0->max_sfb; i++)
- if(cpe->ms_mask[w+i]) msc++;
- if(msc == 0 || ics0->max_sfb == 0) cpe->ms_mode = 0;
- else cpe->ms_mode = msc < ics0->max_sfb ? 1 : 2;
- }
-}
-
-/**
* constants for 3GPP AAC psychoacoustic model
* @{
*/
-#define PSY_3GPP_C1 3.0f // log2(8.0)
-#define PSY_3GPP_C2 1.32192809488736234787f // log2(2.5)
-#define PSY_3GPP_C3 0.55935730170421255071f // 1 - C2/C1
-
#define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark)
#define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark)
@@ -175,10 +55,6 @@ typedef struct Psy3gppBand{
float energy; ///< band energy
float ffac; ///< form factor
float thr; ///< energy threshold
- float pe; ///< perceptual entropy
- float a; ///< constant part in perceptual entropy
- float b; ///< variable part in perceptual entropy
- float nl; ///< predicted number of lines left after quantization
float min_snr; ///< minimal SNR
float thr_quiet; ///< threshold in quiet
}Psy3gppBand;
@@ -187,17 +63,13 @@ typedef struct Psy3gppBand{
* single/pair channel context for psychoacoustic model
*/
typedef struct Psy3gppChannel{
- float a[2]; ///< parameter used for perceptual entropy - constant part
- float b[2]; ///< parameter used for perceptual entropy - variable part
- float pe[2]; ///< channel perceptual entropy
- float thr[2]; ///< channel thresholds sum
- Psy3gppBand band[2][128]; ///< bands information
- Psy3gppBand prev_band[2][128]; ///< bands information from the previous frame
+ Psy3gppBand band[128]; ///< bands information
+ Psy3gppBand prev_band[128]; ///< bands information from the previous frame
- float win_energy[2]; ///< sliding average of channel energy
- float iir_state[2][2]; ///< hi-pass IIR filter state
- uint8_t next_grouping[2]; ///< stored grouping scheme for the next frame (in case of 8 short window sequence)
- enum WindowSequence next_window_seq[2]; ///< window sequence to be used in the next frame
+ float win_energy; ///< sliding average of channel energy
+ float iir_state[2]; ///< hi-pass IIR filter state
+ uint8_t next_grouping; ///< stored grouping scheme for the next frame (in case of 8 short window sequence)
+ enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame
}Psy3gppChannel;
/**
@@ -215,15 +87,13 @@ typedef struct Psy3gppCoeffs{
*/
typedef struct Psy3gppContext{
Psy3gppCoeffs psy_coef[2];
- int reservoir; ///< bit reservoir fullness
- int avg_bits; ///< average frame size of bits for CBR
Psy3gppChannel *ch;
}Psy3gppContext;
/**
* Calculate Bark value for given line.
*/
-static inline float calc_bark(float f)
+static av_cold float calc_bark(float f)
{
return 13.3f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f));
}
@@ -233,7 +103,7 @@ static inline float calc_bark(float f)
* Calculate ATH value for given frequency.
* Borrowed from Lame.
*/
-static inline float ath(float f, float add)
+static av_cold float ath(float f, float add)
{
f /= 1000.0f;
return 3.64 * pow(f, -0.8)
@@ -242,46 +112,43 @@ static inline float ath(float f, float a
+ (0.6 + 0.04 * add) * 0.001 * f * f * f * f;
}
-static av_cold int psy_3gpp_init(AACPsyContext *apc, int elements)
-{
+static av_cold int psy_3gpp_init(FFPsyContext *ctx){
Psy3gppContext *pctx;
float barks[1024];
int i, j, g, start;
float prev, minscale, minath;
- apc->model_priv_data = av_mallocz(sizeof(Psy3gppContext));
- pctx = (Psy3gppContext*) apc->model_priv_data;
+
+ ctx->model_priv_data = av_mallocz(sizeof(Psy3gppContext));
+ pctx = (Psy3gppContext*) ctx->model_priv_data;
for(i = 0; i < 1024; i++)
- barks[i] = calc_bark(i * apc->avctx->sample_rate / 2048.0);
+ barks[i] = calc_bark(i * ctx->avctx->sample_rate / 2048.0);
minath = ath(3410, ATH_ADD);
for(j = 0; j < 2; j++){
Psy3gppCoeffs *coeffs = &pctx->psy_coef[j];
- int bands = j ? apc->num_bands128 : apc->num_bands1024;
i = 0;
prev = 0.0;
- for(g = 0; g < bands; g++){
- i += j ? apc->bands128[g] : apc->bands1024[g];
+ for(g = 0; g < ctx->num_bands[j]; g++){
+ i += ctx->bands[j][g];
coeffs->barks[g] = (barks[i - 1] + prev) / 2.0;
prev = barks[i - 1];
}
- for(g = 0; g < bands - 1; g++){
+ for(g = 0; g < ctx->num_bands[j] - 1; g++){
coeffs->spread_low[g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_LOW);
coeffs->spread_hi [g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_HI);
}
start = 0;
- for(g = 0; g < bands; g++){
- int size = j ? apc->bands128[g] : apc->bands1024[g];
- minscale = ath(apc->avctx->sample_rate * start / 1024.0, ATH_ADD);
- for(i = 1; i < size; i++){
- minscale = fminf(minscale, ath(apc->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD));
+ for(g = 0; g < ctx->num_bands[j]; g++){
+ minscale = ath(ctx->avctx->sample_rate * start / 1024.0, ATH_ADD);
+ for(i = 1; i < ctx->bands[j][g]; i++){
+ minscale = fminf(minscale, ath(ctx->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD));
}
coeffs->ath[g] = minscale - minath;
- start += size;
+ start += ctx->bands[j][g];
}
}
- pctx->avg_bits = apc->avctx->bit_rate * 1024 / apc->avctx->sample_rate;
- pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * elements);
+ pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * ctx->avctx->channels);
return 0;
}
@@ -309,519 +176,146 @@ static const uint8_t window_grouping[9]
* Tell encoder which window types to use.
* @see 3GPP TS26.403 5.4.1 "Blockswitching"
*/
-static void psy_3gpp_window(AACPsyContext *apc, int16_t *audio, int16_t *la,
- int tag, int type, ChannelElement *cpe)
+static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
+ const int16_t *audio, const int16_t *la,
+ int channel, int prev_type)
{
- int ch;
- int chans = type == TYPE_CPE ? 2 : 1;
int i, j;
- int br = apc->avctx->bit_rate / apc->avctx->channels;
- int attack_ratio = (br <= 16000 + 8000*chans) ? 18 : 10;
- Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
- Psy3gppChannel *pch = &pctx->ch[tag];
- uint8_t grouping[2];
- enum WindowSequence win[2];
- IndividualChannelStream *ics0 = &cpe->ch[0].ics, *ics1 = &cpe->ch[1].ics;
+ int br = ctx->avctx->bit_rate / ctx->avctx->channels;
+ int attack_ratio = br <= 16000 ? 18 : 10;
+ Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
+ Psy3gppChannel *pch = &pctx->ch[channel];
+ uint8_t grouping = 0;
+ FFPsyWindowInfo wi;
- if(la && !(apc->flags & PSY_MODEL_NO_SWITCH)){
+ memset(&wi, 0, sizeof(wi));
+ if(la){
float s[8], v;
- for(ch = 0; ch < chans; ch++){
- enum WindowSequence last_window_sequence = cpe->ch[ch].ics.window_sequence[0];
- int switch_to_eight = 0;
- float sum = 0.0, sum2 = 0.0;
- int attack_n = 0;
- for(i = 0; i < 8; i++){
- for(j = 0; j < 128; j++){
- v = iir_filter(audio[(i*128+j)*apc->avctx->channels+ch], pch->iir_state[ch]);
- sum += v*v;
- }
- s[i] = sum;
- sum2 += sum;
- }
- for(i = 0; i < 8; i++){
- if(s[i] > pch->win_energy[ch] * attack_ratio){
- attack_n = i + 1;
- switch_to_eight = 1;
- break;
- }
+ int switch_to_eight = 0;
+ float sum = 0.0, sum2 = 0.0;
+ int attack_n = 0;
+ for(i = 0; i < 8; i++){
+ for(j = 0; j < 128; j++){
+ v = iir_filter(audio[(i*128+j)*ctx->avctx->channels], pch->iir_state);
+ sum += v*v;
}
- pch->win_energy[ch] = pch->win_energy[ch]*7/8 + sum2/64;
-
- switch(last_window_sequence){
- case ONLY_LONG_SEQUENCE:
- win[ch] = switch_to_eight ? LONG_START_SEQUENCE : ONLY_LONG_SEQUENCE;
- grouping[ch] = 0;
- break;
- case LONG_START_SEQUENCE:
- win[ch] = EIGHT_SHORT_SEQUENCE;
- grouping[ch] = pch->next_grouping[ch];
- break;
- case LONG_STOP_SEQUENCE:
- win[ch] = ONLY_LONG_SEQUENCE;
- grouping[ch] = 0;
- break;
- case EIGHT_SHORT_SEQUENCE:
- win[ch] = switch_to_eight ? EIGHT_SHORT_SEQUENCE : LONG_STOP_SEQUENCE;
- grouping[ch] = switch_to_eight ? pch->next_grouping[ch] : 0;
+ s[i] = sum;
+ sum2 += sum;
+ }
+ for(i = 0; i < 8; i++){
+ if(s[i] > pch->win_energy * attack_ratio){
+ attack_n = i + 1;
+ switch_to_eight = 1;
break;
}
- pch->next_grouping[ch] = window_grouping[attack_n];
}
- }else{
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- win[ch] = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE)
- ? EIGHT_SHORT_SEQUENCE
- : ONLY_LONG_SEQUENCE;
- grouping[ch] = (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) ? window_grouping[0] : 0;
- }
- }
+ pch->win_energy = pch->win_energy*7/8 + sum2/64;
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- ics->window_sequence[0] = win[ch];
- ics->use_kb_window[0] = 1;
- if(win[ch] != EIGHT_SHORT_SEQUENCE){
- ics->num_windows = 1;
- ics->swb_sizes = apc->bands1024;
- ics->num_swb = apc->num_bands1024;
- ics->num_window_groups = 1;
- ics->group_len[0] = 1;
- }else{
- int lastgrp = 0;
- ics->num_windows = 8;
- ics->swb_sizes = apc->bands128;
- ics->num_swb = apc->num_bands128;
- ics->num_window_groups = 0;
- memset(ics->group_len, 0, sizeof(ics->group_len));
- for(i = 0; i < 8; i++){
- if(!((grouping[ch] >> i) & 1))
- lastgrp = i;
- ics->group_len[lastgrp]++;
- }
+ wi.window_type[1] = prev_type;
+ switch(prev_type){
+ case ONLY_LONG_SEQUENCE:
+ wi.window_type[0] = switch_to_eight ? LONG_START_SEQUENCE : ONLY_LONG_SEQUENCE;
+ break;
+ case LONG_START_SEQUENCE:
+ wi.window_type[0] = EIGHT_SHORT_SEQUENCE;
+ grouping = pch->next_grouping;
+ break;
+ case LONG_STOP_SEQUENCE:
+ wi.window_type[0] = ONLY_LONG_SEQUENCE;
+ break;
+ case EIGHT_SHORT_SEQUENCE:
+ wi.window_type[0] = switch_to_eight ? EIGHT_SHORT_SEQUENCE : LONG_STOP_SEQUENCE;
+ grouping = switch_to_eight ? pch->next_grouping : 0;
+ break;
}
+ pch->next_grouping = window_grouping[attack_n];
+ }else{
+ for(i = 0; i < 3; i++)
+ wi.window_type[i] = prev_type;
+ grouping = (prev_type == EIGHT_SHORT_SEQUENCE) ? window_grouping[0] : 0;
}
- cpe->common_window = 0;
- if(chans > 1
- && ics0->window_sequence[0] == ics1->window_sequence[0]
- && ics0->use_kb_window[0] == ics1->use_kb_window[0]
- && !(ics0->window_sequence[0] == EIGHT_SHORT_SEQUENCE && grouping[0] != grouping[1]))
- cpe->common_window = 1;
- if(PSY_MODEL_MODE(apc->flags) > PSY_MODE_QUALITY){
- av_log(apc->avctx, AV_LOG_ERROR, "Unknown mode %d, defaulting to CBR\n", PSY_MODEL_MODE(apc->flags));
- }
-}
-/**
- * Modify threshold by adding some value in loudness domain.
- * @see 3GPP TS26.403 5.6.1.1.1 "Addition of noise with equal loudness"
- */
-static inline float modify_thr(float thr, float r){
- float t;
- t = pow(thr, 0.25) + r;
- return (t*t)*(t*t);
-}
-
-/**
- * Calculate perceptual entropy and its corresponding values for one band.
- * @see 3GPP TS26.403 5.6.1.3 "Calculation of the reduction value"
- */
-static void calc_pe(Psy3gppBand *band, int band_width)
-{
- if(band->energy <= band->thr){
- band->a = 0.0f;
- band->b = 0.0f;
- band->nl = 0.0f;
- return;
- }
- band->nl = band->ffac / pow(band->energy/band_width, 0.25);
- if(band->energy >= band->thr * 8.0){
- band->a = band->nl * log2(band->energy);
- band->b = band->nl;
+ wi.window_shape = 1;
+ if(wi.window_type[0] != EIGHT_SHORT_SEQUENCE){
+ wi.num_windows = 1;
+ wi.grouping[0] = 1;
}else{
- band->a = band->nl * (PSY_3GPP_C2 + PSY_3GPP_C3 * log2(band->energy));
- band->b = band->nl * PSY_3GPP_C3;
+ int lastgrp = 0;
+ wi.num_windows = 8;
+ for(i = 0; i < 8; i++){
+ if(!((grouping >> i) & 1))
+ lastgrp = i;
+ wi.grouping[lastgrp]++;
+ }
}
- band->pe = band->a - band->b * log2(band->thr);
- band->min_snr = 1.0 / (pow(2.0, band->pe / band_width) - 1.5);
- band->min_snr = av_clipf(band->min_snr, 1.26f, 316.2277f);
-}
-/**
- * Determine scalefactor from band threshold and form factor.
- * @see 3GPP TS26.403 5.4 5.6.2 "Scalefactor determination"
- */
-static inline int determine_scalefactor(Psy3gppBand *band)
-{
- //spec gives constant for lg() but we scaled it for log2()
- return (int)(2.66667 * log2(6.75*band->thr/band->ffac));
+ return wi;
}
/**
- * Determine scalefactors and prepare coefficients for encoding.
- * @see 3GPP TS26.403 5.4 "Psychoacoustic model"
+ * Calculate band thresholds as suggested in 3GPP TS26.403
*/
-static void psy_3gpp_process(AACPsyContext *apc, int tag, int type, ChannelElement *cpe)
+static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, const float *coefs,
+ FFPsyWindowInfo *wi)
{
- int start;
- int ch, w, g, i;
- Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
- float pe_target;
- int bits_avail;
- int chans = type == TYPE_CPE ? 2 : 1;
- Psy3gppChannel *pch = &pctx->ch[tag];
+ Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data;
+ Psy3gppChannel *pch = &pctx->ch[channel];
+ int start = 0;
+ int i, w, g;
+ const int num_bands = ctx->num_bands[wi->num_windows == 8];
+ const uint8_t* band_sizes = ctx->bands[wi->num_windows == 8];
+ Psy3gppCoeffs *coeffs = &pctx->psy_coef[wi->num_windows == 8];
//calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
- memset(pch->band, 0, sizeof(pch->band));
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- start = 0;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- Psy3gppBand *band = &pch->band[ch][w+g];
- for(i = 0; i < ics->swb_sizes[g]; i++)
- band->energy += cpe->ch[ch].coeffs[start+i] * cpe->ch[ch].coeffs[start+i];
- band->energy *= 1.0f / (512*512);
- band->thr = band->energy * 0.001258925f;
- start += ics->swb_sizes[g];
- if(band->energy != 0.0){
- float ffac = 0.0;
+ for(w = 0; w < wi->num_windows*16; w += 16){
+ for(g = 0; g < num_bands; g++){
+ Psy3gppBand *band = &pch->band[w+g];
+ for(i = 0; i < band_sizes[g]; i++)
+ band->energy += coefs[start+i] * coefs[start+i];
+ band->energy *= 1.0f / (512*512);
+ band->thr = band->energy * 0.001258925f;
+ start += band_sizes[g];
- for(i = 0; i < ics->swb_sizes[g]; i++)
- ffac += sqrt(FFABS(cpe->ch[ch].coeffs[start+i]));
- band->ffac = ffac / sqrt(512.0);
- }
- }
+ ctx->psy_bands[channel*PSY_MAX_BANDS+w+g].energy = band->energy;
}
}
-
//modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- Psy3gppCoeffs *coeffs = &pctx->psy_coef[ics->num_windows == 8];
- for(w = 0; w < ics->num_windows*16; w += 16){
- Psy3gppBand *band = &pch->band[ch][w];
- for(g = 1; g < ics->num_swb; g++){
- band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]);
- }
- for(g = ics->num_swb - 2; g >= 0; g--){
- band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g+1]);
- }
- for(g = 0; g < ics->num_swb; g++){
- band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]);
- band[g].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*band[g].thr_quiet,
- fminf(band[g].thr_quiet,
- PSY_3GPP_RPELEV*pch->prev_band[ch][w+g].thr_quiet));
- band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25);
- }
- }
- }
-
- // M/S detection - 5.5.2 "Mid/Side Stereo"
- if(chans > 1 && cpe->common_window){
- start = 0;
- for(w = 0; w < cpe->ch[0].ics.num_windows*16; w += 16){
- for(g = 0; g < cpe->ch[0].ics.num_swb; g++){
- Psy3gppBand *band0 = &pch->band[0][w+g];
- Psy3gppBand *band1 = &pch->band[1][w+g];
- double en_m = 0.0, en_s = 0.0, ff_m = 0.0, ff_s = 0.0, minthr;
- float m, s;
-
- cpe->ms_mask[w+g] = 0;
- if(band0->energy == 0.0 || band1->energy == 0.0)
- continue;
- for(i = 0; i < cpe->ch[0].ics.swb_sizes[g]; i++){
- m = cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i];
- s = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
- en_m += m*m;
- en_s += s*s;
- }
- en_m *= 1.0f / (512*512*4);
- en_s *= 1.0f / (512*512*4);
- minthr = FFMIN(band0->thr, band1->thr);
- if(minthr * minthr * band0->energy * band1->energy >= band0->thr * band1->thr * en_m * en_s){
- cpe->ms_mask[w+g] = 1;
- band0->energy = en_m;
- band1->energy = en_s;
- band0->thr = en_m * 0.001258925f;
- band1->thr = en_s * 0.001258925f;
- for(i = 0; i < cpe->ch[0].ics.swb_sizes[g]; i++){
- m = cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i];
- s = cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i];
- ff_m += sqrt(fabs(m));
- ff_s += sqrt(fabs(s));
- }
- band0->ffac = ff_m * (1.0f / 32.0f); // sqrt(512)*sqrt(2)
- band1->ffac = ff_s * (1.0f / 32.0f);
- }
- }
- }
- }
-
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- pch->a[ch] = pch->b[ch] = pch->pe[ch] = pch->thr[ch] = 0.0f;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- Psy3gppBand *band = &pch->band[ch][w+g];
- if(band->energy != 0.0)
- calc_pe(band, ics->swb_sizes[g]);
- if(band->thr < band->energy){
- pch->a[ch] += band->a;
- pch->b[ch] += band->b;
- pch->pe[ch] += band->pe;
- pch->thr[ch] += band->thr;
- }
- }
- }
- }
-
- switch(PSY_MODEL_MODE(apc->flags)){
- case PSY_MODE_CBR:
- case PSY_MODE_ABR:
- //bitrate reduction - 5.6.1 "Reduction of psychoacoustic requirements"
- if(PSY_MODEL_MODE(apc->flags) != PSY_MODE_ABR){
- pctx->reservoir += pctx->avg_bits - apc->avctx->frame_bits;
- bits_avail = pctx->avg_bits + pctx->reservoir;
- bits_avail = FFMIN(bits_avail, pctx->avg_bits * 1.5);
- pe_target = 1.18f * bits_avail / apc->avctx->channels * chans;
- }else{
- pe_target = pctx->avg_bits / apc->avctx->channels * chans;
- }
- for(i = 0; i < 2; i++){
- float t0, pe, r, a0 = 0.0f, pe0 = 0.0f, b0 = 0.0f;
- for(ch = 0; ch < chans; ch++){
- a0 += pch->a[ch];
- b0 += pch->b[ch];
- pe0 += pch->pe[ch];
- }
- if(pe0 == 0.0f) break;
- t0 = pow(2.0, (a0 - pe0) / (4.0 * b0));
- r = pow(2.0, (a0 - pe_target) / (4.0 * b0)) - t0;
-
- //add correction factor to thresholds and recalculate perceptual entropy
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- pch->a[ch] = pch->b[ch] = pch->pe[ch] = pch->thr[ch] = 0.0;
- pe = 0.0f;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- Psy3gppBand *band = &pch->band[ch][w+g];
- band->thr = modify_thr(band->thr, r);
- calc_pe(band, ics->swb_sizes[g]);
- if(band->thr < band->energy){
- pch->a[ch] += band->a;
- pch->b[ch] += band->b;
- pch->pe[ch] += band->pe;
- pch->thr[ch] += band->thr;
- }
- }
- }
- }
- }
-
- //determine scalefactors - 5.6.2 "Scalefactor determination"
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- Psy3gppBand *band = &pch->band[ch][w+g];
- cpe->ch[ch].zeroes[w+g] = band->thr >= band->energy;
- if(cpe->ch[ch].zeroes[w+g]) continue;
- cpe->ch[ch].sf_idx[w+g] = determine_scalefactor(band);
- }
- }
+ for(w = 0; w < wi->num_windows*16; w += 16){
+ Psy3gppBand *band = &pch->band[w];
+ for(g = 1; g < num_bands; g++){
+ band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]);
}
- break;
- case PSY_MODE_QUALITY:
- for(ch = 0; ch < chans; ch++){
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- start = 0;
- for(w = 0; w < ics->num_windows*16; w += 16){
- for(g = 0; g < ics->num_swb; g++){
- Psy3gppBand *band = &pch->band[ch][w+g];
- if(band->thr >= band->energy){
- cpe->ch[ch].sf_idx[w+g] = 0;
- cpe->ch[ch].zeroes[w+g] = 1;
- }else{
- cpe->ch[ch].zeroes[w+g] = 0;
- cpe->ch[ch].sf_idx[w+g] = determine_scalefactor(band);
- while(cpe->ch[ch].sf_idx[w+g] > 3){
- float dist = get_approximate_quant_error(cpe->ch[ch].coeffs + start,
- ics->swb_sizes[g],
- SCALE_ONE_POS + cpe->ch[ch].sf_idx[w+g]);
- if(dist < band->thr) break;
- cpe->ch[ch].sf_idx[w+g] -= 3;
- }
- }
- start += ics->swb_sizes[g];
- }
- }
+ for(g = num_bands - 2; g >= 0; g--){
+ band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g+1]);
}
- break;
- }
-
- //limit scalefactors
- for(ch = 0; ch < chans; ch++){
- int min_scale = 256;
- IndividualChannelStream *ics = &cpe->ch[ch].ics;
- for(w = 0; w < ics->num_windows*16; w += 16)
- for(g = 0; g < ics->num_swb; g++){
- if(cpe->ch[ch].zeroes[w + g]) continue;
- min_scale = FFMIN(min_scale, cpe->ch[ch].sf_idx[w + g]);
- }
- for(w = 0; w < ics->num_windows*16; w += 16)
- for(g = 0; g < ics->num_swb; g++){
- if(cpe->ch[ch].zeroes[w + g]) continue;
- cpe->ch[ch].sf_idx[w + g] = FFMIN(cpe->ch[ch].sf_idx[w + g], min_scale + SCALE_MAX_DIFF);
- }
- for(w = 0; w < ics->num_windows*16; w += 16)
- for(g = 0; g < ics->num_swb; g++){
- if(cpe->ch[ch].zeroes[w + g])
- cpe->ch[ch].sf_idx[w + g] = 256;
- else
- cpe->ch[ch].sf_idx[w + g] = av_clip(SCALE_ONE_POS + cpe->ch[ch].sf_idx[w + g],
- 0,
- SCALE_MAX_POS);
+ for(g = 0; g < num_bands; g++){
+ band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]);
+ if(wi->num_windows != 8 && wi->window_type[1] != EIGHT_SHORT_SEQUENCE){
+ band[g].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*band[g].thr_quiet,
+ fminf(band[g].thr_quiet,
+ PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
}
+ band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25);
- //adjust scalefactors for window groups
- for(w = 0; w < ics->num_windows; w += ics->group_len[w]){
- int min_scale = 256;
-
- for(g = 0; g < ics->num_swb; g++){
- for(i = w; i < w + ics->group_len[w]; i++){
- if(cpe->ch[ch].zeroes[i*16 + g]) continue;
- min_scale = FFMIN(min_scale, cpe->ch[ch].sf_idx[i*16 + g]);
- }
- for(i = w; i < w + ics->group_len[w]; i++)
- cpe->ch[ch].sf_idx[i*16 + g] = min_scale;
- }
+ ctx->psy_bands[channel*PSY_MAX_BANDS+w+g].threshold = band[g].thr;
}
}
-
memcpy(pch->prev_band, pch->band, sizeof(pch->band));
- psy_create_output(apc, cpe, chans);
}
-static av_cold void psy_3gpp_end(AACPsyContext *apc)
+static av_cold void psy_3gpp_end(FFPsyContext *apc)
{
Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
av_freep(&pctx->ch);
av_freep(&apc->model_priv_data);
}
-static const AACPsyModel psy_models[AAC_NB_PSY_MODELS] =
-{
- {
- "3GPP TS 26.403-inspired model",
- psy_3gpp_init,
- psy_3gpp_window,
- psy_3gpp_process,
- psy_3gpp_end,
- },
-};
-
-int av_cold ff_aac_psy_init(AACPsyContext *ctx, AVCodecContext *avctx,
- enum AACPsyModelType model, int elements, int flags,
- const uint8_t *bands1024, int num_bands1024,
- const uint8_t *bands128, int num_bands128)
-{
- int i;
-
- if(model < 0 || model >= AAC_NB_PSY_MODELS){
- av_log(avctx, AV_LOG_ERROR, "Invalid psy model\n");
- return -1;
- }
-
-#ifndef CONFIG_HARDCODED_TABLES
- for (i = 0; i < 316; i++)
- ff_aac_pow2sf_tab[i] = pow(2, (i - 200)/4.);
-#endif /* CONFIG_HARDCODED_TABLES */
-
- ctx->avctx = avctx;
- ctx->flags = flags;
- ctx->bands1024 = bands1024;
- ctx->num_bands1024 = num_bands1024;
- ctx->bands128 = bands128;
- ctx->num_bands128 = num_bands128;
- ctx->model = &psy_models[model];
-
- if(ctx->flags & PSY_MODEL_NO_ST_ATT || PSY_MODEL_MODE(ctx->flags) == PSY_MODE_QUALITY){
- ctx->flags |= PSY_MODEL_NO_ST_ATT;
- ctx->stereo_att = 0.5f;
- }else{
- ctx->stereo_att = av_clipf(avctx->bit_rate / elements / 192000.0, 0.0f, 0.5f);
- }
- if(ctx->flags & PSY_MODEL_NO_LOWPASS || PSY_MODEL_MODE(ctx->flags) == PSY_MODE_QUALITY){
- ctx->flags |= PSY_MODEL_NO_LOWPASS;
- }else{
- float cutoff = (float)avctx->bit_rate / elements / 8 / avctx->sample_rate;
- ctx->lp_coeffs = ff_lowpass_filter_init_coeffs(4, cutoff);
- if(!ctx->lp_coeffs){
- ctx->flags |= PSY_MODEL_NO_LOWPASS;
- }else{
- ctx->lp_state = av_malloc(sizeof(struct FFLPFilterState*) * elements * 2);
- for(i = 0; i < elements*2; i++)
- ctx->lp_state[i] = ff_lowpass_filter_init_state(4);
- }
- }
- ctx->elements = elements;
- if(ctx->model->init)
- return ctx->model->init(ctx, elements);
- return 0;
-}
-
-void ff_aac_psy_suggest_window(AACPsyContext *ctx, int16_t *audio, int16_t *la,
- int tag, int type, ChannelElement *cpe)
-{
- ctx->model->window(ctx, audio, la, tag, type, cpe);
-}
-
-void ff_aac_psy_analyze(AACPsyContext *ctx, int tag, int type, ChannelElement *cpe)
-{
- ctx->model->process(ctx, tag, type, cpe);
-}
-
-void av_cold ff_aac_psy_end(AACPsyContext *ctx)
-{
- if(!(ctx->flags & PSY_MODEL_NO_LOWPASS)){
- int i;
- ff_lowpass_filter_free_coeffs(ctx->lp_coeffs);
- for(i = 0; i < ctx->elements; i++)
- ff_lowpass_filter_free_state(ctx->lp_state[i]);
- av_freep(&ctx->lp_state);
- }
- if(ctx->model->end)
- return ctx->model->end(ctx);
-}
-void ff_aac_psy_preprocess(AACPsyContext *ctx, int16_t *audio, int16_t *dest, int tag, int type)
+const FFPsyModel ff_aac_psy_model =
{
- int chans = type == TYPE_CPE ? 2 : 1;
- const int chstride = ctx->avctx->channels;
- int i, ch;
- float t[2];
-
- for(ch = 0; ch < chans; ch++){
- if(!(ctx->flags & PSY_MODEL_NO_LOWPASS)){
- ff_lowpass_filter(ctx->lp_coeffs, ctx->lp_state[tag*2 + ch], 1024,
- audio + ch, chstride,
- dest + ch, chstride);
- }else{
- for(i = 0; i < 1024; i++){
- dest[i * chstride + ch] = audio[i * chstride + ch];
- }
- }
- }
- if(chans == 2 && !(ctx->flags & PSY_MODEL_NO_ST_ATT)){
- for(i = 0; i < 1024; i++){
- t[0] = dest[0] * (0.5 + ctx->stereo_att) + dest[1] * (0.5 - ctx->stereo_att);
- t[1] = dest[0] * (0.5 - ctx->stereo_att) + dest[1] * (0.5 + ctx->stereo_att);
- dest[0] = t[0];
- dest[1] = t[1];
- dest += chstride;
- }
- }
-}
-
+ .name = "3GPP TS 26.403-inspired model",
+ .init = psy_3gpp_init,
+ .window = psy_3gpp_window,
+ .analyze = psy_3gpp_analyze,
+ .end = psy_3gpp_end,
+};
Added: aacenc/psymodel.c
==============================================================================
--- (empty file)
+++ aacenc/psymodel.c Tue Sep 2 08:14:14 2008
@@ -0,0 +1,123 @@
+/*
+ * audio encoder psychoacoustic model
+ * Copyright (C) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "psymodel.h"
+#include "iirfilter.h"
+
+extern const FFPsyModel ff_aac_psy_model;
+
+av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
+ int num_lens,
+ uint8_t **bands, int* num_bands)
+{
+ ctx->avctx = avctx;
+ ctx->psy_bands = av_mallocz(sizeof(FFPsyBand) * PSY_MAX_BANDS * avctx->channels);
+ ctx->bands = av_malloc (sizeof(ctx->bands[0]) * num_lens);
+ ctx->num_bands = av_malloc (sizeof(ctx->num_bands[0]) * num_lens);
+ memcpy(ctx->bands, bands, sizeof(ctx->bands[0]) * num_lens);
+ memcpy(ctx->num_bands, num_bands, sizeof(ctx->num_bands[0]) * num_lens);
+ switch(ctx->avctx->codec_id){
+ case CODEC_ID_AAC:
+ ctx->model = &ff_aac_psy_model;
+ break;
+ }
+ if(ctx->model->init)
+ return ctx->model->init(ctx);
+ return 0;
+}
+
+FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
+ const int16_t *audio, const int16_t *la,
+ int channel, int prev_type)
+{
+ return ctx->model->window(ctx, audio, la, channel, prev_type);
+}
+
+void ff_psy_set_band_info(FFPsyContext *ctx, int channel,
+ const float *coeffs, FFPsyWindowInfo *wi)
+{
+ ctx->model->analyze(ctx, channel, coeffs, wi);
+}
+
+av_cold void ff_psy_end(FFPsyContext *ctx)
+{
+ if(ctx->model->end)
+ ctx->model->end(ctx);
+ av_freep(&ctx->bands);
+ av_freep(&ctx->num_bands);
+ av_freep(&ctx->psy_bands);
+}
+
+typedef struct FFPsyPreprocessContext{
+ AVCodecContext *avctx;
+ float stereo_att;
+ struct FFIIRFilterCoeffs *fcoeffs;
+ struct FFIIRFilterState **fstate;
+}FFPsyPreprocessContext;
+
+#define FILT_ORDER 4
+
+av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx)
+{
+ FFPsyPreprocessContext *ctx;
+ int i;
+ ctx = av_mallocz(sizeof(FFPsyPreprocessContext));
+ ctx->avctx = avctx;
+ ctx->fcoeffs = ff_iir_filter_init_coeffs(FF_FILTER_TYPE_BUTTERWORTH, FF_FILTER_MODE_LOWPASS,
+ FILT_ORDER, 0.25, 0.0, 0.0);
+ if(ctx->fcoeffs){
+ ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
+ for(i = 0; i < avctx->channels; i++)
+ ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
+ }
+ return ctx;
+}
+
+void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
+ const int16_t *audio, int16_t *dest,
+ int tag, int channels)
+{
+ int ch, i;
+ if(ctx->fstate){
+ for(ch = 0; ch < channels; ch++){
+ ff_iir_filter(ctx->fcoeffs, ctx->fstate[tag+ch], ctx->avctx->frame_size,
+ audio + ch, ctx->avctx->channels,
+ dest + ch, ctx->avctx->channels);
+ }
+ }else{
+ for(ch = 0; ch < channels; ch++){
+ for(i = 0; i < ctx->avctx->frame_size; i++)
+ dest[i*ctx->avctx->channels + ch] = audio[i*ctx->avctx->channels + ch];
+ }
+ }
+}
+
+av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx)
+{
+ int i;
+ ff_iir_filter_free_coeffs(ctx->fcoeffs);
+ for(i = 0; i < ctx->avctx->channels; i++){
+ ff_iir_filter_free_state(ctx->fstate[i]);
+ }
+ av_freep(&ctx->fstate);
+}
+
Added: aacenc/psymodel.h
==============================================================================
--- (empty file)
+++ aacenc/psymodel.h Tue Sep 2 08:14:14 2008
@@ -0,0 +1,158 @@
+/*
+ * audio encoder psychoacoustic model
+ * Copyright (C) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef FFMPEG_PSYMODEL_H
+#define FFMPEG_PSYMODEL_H
+
+#include "avcodec.h"
+
+/** maximum possible number of bands */
+#define PSY_MAX_BANDS 128
+
+/**
+ * single band psychoacoustic information
+ */
+typedef struct FFPsyBand{
+ int bits;
+ float energy;
+ float threshold;
+ float distortion;
+ float perceptual_weight;
+}FFPsyBand;
+
+/**
+ * windowing related information
+ */
+typedef struct FFPsyWindowInfo{
+ int window_type[3]; ///< window type (short/long/transitional, etc.) - current, previous and next
+ int window_shape; ///< window shape (sine/KBD/whatever)
+ int num_windows; ///< number of windows in a frame
+ int grouping[8]; ///< window grouping (e.g. AAC)
+ int *window_sizes; ///< sequence of window sizes inside one frame (e.g. WMA)
+}FFPsyWindowInfo;
+
+/**
+ * context used by psychoacoustic model
+ */
+typedef struct FFPsyContext{
+ AVCodecContext *avctx; ///< encoder context
+ const struct FFPsyModel *model; ///< encoder-specific model functions
+
+ FFPsyBand *psy_bands; ///< frame bands information
+
+ uint8_t **bands; ///< scalefactor band sizes for possible frame sizes
+ int *num_bands; ///< number of scalefactor bands for possible frame sizes
+ int num_lens; ///< number of scalefactor band sets
+
+ void* model_priv_data; ///< psychoacoustic model implementation private data
+}FFPsyContext;
+
+/**
+ * codec-specific psychoacoustic model implementation
+ */
+typedef struct FFPsyModel {
+ const char *name;
+ int (*init) (FFPsyContext *apc);
+ FFPsyWindowInfo (*window)(FFPsyContext *ctx, const int16_t *audio, const int16_t *la, int channel, int prev_type);
+ void (*analyze)(FFPsyContext *ctx, int channel, const float *coeffs, FFPsyWindowInfo *wi);
+ void (*end) (FFPsyContext *apc);
+}FFPsyModel;
+
+/**
+ * Initialize psychoacoustic model.
+ *
+ * @param ctx model context
+ * @param avctx codec context
+ * @param num_lens number of possible frame lengths
+ * @param bands scalefactor band lengths for all frame lengths
+ * @param num_bands number of scalefactor bands for all frame lengths
+ *
+ * @return zero if successful, a negative value if not
+ */
+av_cold int ff_psy_init(FFPsyContext *ctx, AVCodecContext *avctx,
+ int num_lens,
+ uint8_t **bands, int* num_bands);
+
+/**
+ * Suggest window sequence for channel.
+ *
+ * @param ctx model context
+ * @param audio samples for the current frame
+ * @param la lookahead samples (NULL when unavailable)
+ * @param channel number of channel element to analyze
+ * @param prev_type previous window type
+ *
+ * @return suggested window information in a structure
+ */
+FFPsyWindowInfo ff_psy_suggest_window(FFPsyContext *ctx,
+ const int16_t *audio, const int16_t *la,
+ int channel, int prev_type);
+
+
+/**
+ * Perform psychoacoustic analysis and set band info (threshold, energy).
+ *
+ * @param ctx model context
+ * @param channel audio channel number
+ * @param coeffs pointer to the transformed coefficients
+ * @param wi window information
+ */
+void ff_psy_set_band_info(FFPsyContext *ctx, int channel, const float *coeffs,
+ FFPsyWindowInfo *wi);
+
+/**
+ * Cleanup model context at the end.
+ *
+ * @param ctx model context
+ */
+av_cold void ff_psy_end(FFPsyContext *ctx);
+
+
+/**************************************************************************
+ * Audio preprocessing stuff. *
+ * This should be moved into some audio filter eventually. *
+ **************************************************************************/
+struct FFPsyPreprocessContext;
+
+/**
+ * psychoacoustic model audio preprocessing initialization
+ */
+av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *avctx);
+
+/**
+ * Preprocess several channels in an audio frame in order to compress it better.
+ *
+ * @param ctx preprocessing context
+ * @param audio samples to preprocess
+ * @param dest place to put filtered samples
+ * @param tag channel number
+ * @param channels number of channels to preprocess (some additional work may be done on a stereo pair)
+ */
+void ff_psy_preprocess(struct FFPsyPreprocessContext *ctx,
+ const int16_t *audio, int16_t *dest,
+ int tag, int channels);
+
+/**
+ * Cleanup audio preprocessing module.
+ */
+av_cold void ff_psy_preprocess_end(struct FFPsyPreprocessContext *ctx);
+
+#endif /* FFMPEG_PSYMODEL_H */
More information about the FFmpeg-soc
mailing list