[FFmpeg-devel] [PATCH 2/3] [GSoC] [AAC] aaccoder: Implement Perceptual Noise Substitution

Rostislav Pehlivanov atomnuker at gmail.com
Sun Apr 12 06:50:35 CEST 2015


This commit enables the use of the pseudo-codebook NOISE_BT for encoding noise values for the twoloop coder. It uses the energy values from the psychoacoustic model to determine whether it's acceptable to use noise for encoding and, if so, to determine the energy of the noise. The cost system was modified to accept the 13th codebook (skipping the nonexistent 12). The system was extended such that in the future it should be easy to add support for intensity stereo coding, hence the use of arrays for the maps.

The parameters used (such as the factor by which uplims is multiplied when comparing and the cost returned by the BT_NOISE case) and the way energy values are converted to scalefactor indices have not been extensively tested, so safe values which should not break anything were used. They are to be tweaked in the future to optimize audio quality if needed.
---
 libavcodec/aaccoder.c | 128 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 42 deletions(-)

diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index 64eee32..f7662fd 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -40,6 +40,9 @@
 #include "aacenc.h"
 #include "aactab.h"
 
+/** Total number of usable codebooks **/
+#define CB_TOT 13
+
 /** bits needed to code codebook run value for long windows */
 static const uint8_t run_value_bits_long[64] = {
      5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
@@ -57,6 +60,10 @@ static const uint8_t * const run_value_bits[2] = {
     run_value_bits_long, run_value_bits_short
 };
 
+/** Map to convert values from BandCodingPath index to a codebook index **/
+static const uint8_t aac_cb_out_map[CB_TOT]  = {0,1,2,3,4,5,6,7,8,9,10,11,13};
+/** Inverse map to convert from codebooks to BandCodingPath indices **/
+static const uint8_t aac_cb_in_map[CB_TOT+1] = {0,1,2,3,4,5,6,7,8,9,10,11,0,12};
 
 /**
  * Quantize one coefficient.
@@ -108,7 +115,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
                                 int *bits, int BT_ZERO, int BT_UNSIGNED,
-                                int BT_PAIR, int BT_ESC)
+                                int BT_PAIR, int BT_ESC, int BT_NOISE)
 {
     const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512;
     const float Q   = ff_aac_pow2sf_tab [q_idx];
@@ -119,8 +126,6 @@ static av_always_inline float quantize_and_encode_band_cost_template(
     float cost = 0;
     const int dim = BT_PAIR ? 2 : 4;
     int resbits = 0;
-    const int range  = aac_cb_range[cb];
-    const int maxval = aac_cb_maxval[cb];
     int off;
 
     if (BT_ZERO) {
@@ -130,15 +135,22 @@ static av_always_inline float quantize_and_encode_band_cost_template(
             *bits = 0;
         return cost * lambda;
     }
+    if (BT_NOISE) {
+        for (i = 0; i < size; i++)
+            cost += in[i]*in[i];
+        if (bits)
+            *bits = 0;
+        return cost * lambda;
+    }
     if (!scaled) {
         abs_pow34_v(s->scoefs, in, size);
         scaled = s->scoefs;
     }
-    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, maxval);
+    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, aac_cb_maxval[cb]);
     if (BT_UNSIGNED) {
         off = 0;
     } else {
-        off = maxval;
+        off = aac_cb_maxval[cb];
     }
     for (i = 0; i < size; i += dim) {
         const float *vec;
@@ -147,7 +159,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
         int curbits;
         float rd = 0.0f;
         for (j = 0; j < dim; j++) {
-            curidx *= range;
+            curidx *= aac_cb_range[cb];
             curidx += quants[j] + off;
         }
         curbits =  ff_aac_spectral_bits[cb-1][curidx];
@@ -207,8 +219,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
     return cost;
 }
 
-#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC) \
-static float quantize_and_encode_band_cost_ ## NAME(                                        \
+#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE) \
+static float quantize_and_encode_band_cost_ ## NAME(                                    \
                                 struct AACEncContext *s,                                \
                                 PutBitContext *pb, const float *in,                     \
                                 const float *scaled, int size, int scale_idx,           \
@@ -217,15 +229,16 @@ static float quantize_and_encode_band_cost_ ## NAME(
     return quantize_and_encode_band_cost_template(                                      \
                                 s, pb, in, scaled, size, scale_idx,                     \
                                 BT_ESC ? ESC_BT : cb, lambda, uplim, bits,              \
-                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC);                 \
+                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE);       \
 }
 
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0)
-QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1, 0)
+QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NOISE, 0, 0, 0, 0, 1)
 
 static float (*const quantize_and_encode_band_cost_arr[])(
                                 struct AACEncContext *s,
@@ -245,6 +258,8 @@ static float (*const quantize_and_encode_band_cost_arr[])(
     quantize_and_encode_band_cost_UPAIR,
     quantize_and_encode_band_cost_UPAIR,
     quantize_and_encode_band_cost_ESC,
+    NULL,
+    quantize_and_encode_band_cost_NOISE,
 };
 
 #define quantize_and_encode_band_cost(                                  \
@@ -312,7 +327,7 @@ typedef struct BandCodingPath {
 static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce,
                                      int win, int group_len, const float lambda)
 {
-    BandCodingPath path[120][12];
+    BandCodingPath path[120][CB_TOT];
     int w, swb, cb, start, size;
     int i, j;
     const int max_sfb  = sce->ics.max_sfb;
@@ -325,7 +340,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
 
     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
     start = win*128;
-    for (cb = 0; cb < 12; cb++) {
+    for (cb = 0; cb < CB_TOT; cb++) {
         path[0][cb].cost     = 0.0f;
         path[0][cb].prev_idx = -1;
         path[0][cb].run      = 0;
@@ -333,7 +348,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     for (swb = 0; swb < max_sfb; swb++) {
         size = sce->ics.swb_sizes[swb];
         if (sce->zeroes[win*16 + swb]) {
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT; cb++) {
                 path[swb+1][cb].prev_idx = cb;
                 path[swb+1][cb].cost     = path[swb][cb].cost;
                 path[swb+1][cb].run      = path[swb][cb].run + 1;
@@ -343,14 +358,14 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
             int mincb = next_mincb;
             next_minrd = INFINITY;
             next_mincb = 0;
-            for (cb = 0; cb < 12; cb++) {
+            for (cb = 0; cb < CB_TOT; cb++) {
                 float cost_stay_here, cost_get_here;
                 float rd = 0.0f;
                 for (w = 0; w < group_len; w++) {
                     FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(win+w)*16+swb];
                     rd += quantize_band_cost(s, sce->coeffs + start + w*128,
                                              s->scoefs + start + w*128, size,
-                                             sce->sf_idx[(win+w)*16+swb], cb,
+                                             sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
                                              lambda / band->threshold, INFINITY, NULL);
                 }
                 cost_stay_here = path[swb][cb].cost + rd;
@@ -379,7 +394,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //convert resulting path from backward-linked list
     stack_len = 0;
     idx       = 0;
-    for (cb = 1; cb < 12; cb++)
+    for (cb = 1; cb < CB_TOT; cb++)
         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
             idx = cb;
     ppos = max_sfb;
@@ -394,12 +409,13 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     //perform actual band info encoding
     start = 0;
     for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
         count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
+        memset(sce->zeroes + win*16 + start, !cb, count);
         //XXX: memset when band_type is also uint8_t
         for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
+            sce->band_type[win*16 + start] = cb;
             start++;
         }
         while (count >= run_esc) {
@@ -413,7 +429,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
 static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
                                   int win, int group_len, const float lambda)
 {
-    BandCodingPath path[120][12];
+    BandCodingPath path[120][CB_TOT];
     int w, swb, cb, start, size;
     int i, j;
     const int max_sfb  = sce->ics.max_sfb;
@@ -426,7 +442,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
 
     abs_pow34_v(s->scoefs, sce->coeffs, 1024);
     start = win*128;
-    for (cb = 0; cb < 12; cb++) {
+    for (cb = 0; cb < CB_TOT; cb++) {
         path[0][cb].cost     = run_bits+4;
         path[0][cb].prev_idx = -1;
         path[0][cb].run      = 0;
@@ -450,7 +466,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
             }
             next_minbits = path[swb+1][0].cost;
             next_mincb = 0;
-            for (cb = 1; cb < 12; cb++) {
+            for (cb = 1; cb < CB_TOT; cb++) {
                 path[swb+1][cb].cost = 61450;
                 path[swb+1][cb].prev_idx = -1;
                 path[swb+1][cb].run = 0;
@@ -459,6 +475,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
             float minbits = next_minbits;
             int mincb = next_mincb;
             int startcb = sce->band_type[win*16+swb];
+            startcb = aac_cb_in_map[startcb];
             next_minbits = INFINITY;
             next_mincb = 0;
             for (cb = 0; cb < startcb; cb++) {
@@ -466,13 +483,20 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
                 path[swb+1][cb].prev_idx = -1;
                 path[swb+1][cb].run = 0;
             }
-            for (cb = startcb; cb < 12; cb++) {
+            for (cb = startcb; cb < CB_TOT; cb++) {
                 float cost_stay_here, cost_get_here;
                 float bits = 0.0f;
+                if (cb == 12 && sce->band_type[win*16+swb] != NOISE_BT) {
+                    path[swb+1][cb].cost = 61450;
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].run = 0;
+                    continue;
+                }
                 for (w = 0; w < group_len; w++) {
                     bits += quantize_band_cost(s, sce->coeffs + start + w*128,
                                                s->scoefs + start + w*128, size,
-                                               sce->sf_idx[(win+w)*16+swb], cb,
+                                               sce->sf_idx[(win+w)*16+swb],
+                                               aac_cb_out_map[cb],
                                                0, INFINITY, NULL);
                 }
                 cost_stay_here = path[swb][cb].cost + bits;
@@ -501,7 +525,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
     //convert resulting path from backward-linked list
     stack_len = 0;
     idx       = 0;
-    for (cb = 1; cb < 12; cb++)
+    for (cb = 1; cb < CB_TOT; cb++)
         if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
             idx = cb;
     ppos = max_sfb;
@@ -517,12 +541,13 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
     //perform actual band info encoding
     start = 0;
     for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
         count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
+        memset(sce->zeroes + win*16 + start, !cb, count);
         //XXX: memset when band_type is also uint8_t
         for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
+            sce->band_type[win*16 + start] = cb;
             start++;
         }
         while (count >= run_esc) {
@@ -711,8 +736,9 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
 {
     int start = 0, i, w, w2, g;
     int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128];
+    float dists[128] = { 0 }, uplims[128] = { 0 }, energies[128] = { 0 };
     float maxvals[128];
+    float energy_avg = 0;
     int fflag, minscaler;
     int its  = 0;
     int allz = 0;
@@ -724,32 +750,47 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
     //XXX: some heuristic to determine initial quantizers will reduce search time
     //determine zero bands and upper limits
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = 0;
         for (g = 0;  g < sce->ics.num_swb; g++) {
             int nz = 0;
-            float uplim = 0.0f;
+            float uplim = 0.0f, energy = 0.0f;
+            float freq = (w*16+g)*(avctx->sample_rate/(1024/sce->ics.num_windows)/2);
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                 uplim += band->threshold;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                energy += band->energy;
+                if (band->threshold == 0.0f || band->energy < band->threshold) {
                     sce->zeroes[(w+w2)*16+g] = 1;
                     continue;
                 }
                 nz = 1;
             }
             uplims[w*16+g] = uplim *512;
+            energies[w*16+g] = log2f(2*(energy*energy));
+            energy_avg = (energies[w*16+g] + energy_avg)/2;
+            if (freq > 4000.0f && energy <= uplim * 1.52f) {
+                sce->band_type[w*16+g] = NOISE_BT;
+                nz = 1;
+            } else { /* Will be determined in the two-loop search */
+                sce->band_type[w*16+g] = 0;
+            }
             sce->zeroes[w*16+g] = !nz;
             if (nz)
                 minthr = FFMIN(minthr, uplim);
             allz |= nz;
+            start += sce->ics.swb_sizes[g];
         }
     }
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
         for (g = 0;  g < sce->ics.num_swb; g++) {
             if (sce->zeroes[w*16+g]) {
                 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
+            } else if (sce->band_type[w*16+g] == NOISE_BT) {
+                float energy_norm = (energies[w*16+g]/energy_avg);
+                sce->sf_idx[w*16+g] = av_clip((energy_norm*256) - 70, -100, 155);
+            } else {
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
             }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
         }
     }
 
@@ -785,7 +826,8 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                     int cb;
                     float dist = 0.0f;
 
-                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                    if (sce->zeroes[w*16+g] || sce->band_type[w*16+g] >= NOISE_BT ||
+                        sce->sf_idx[w*16+g] >= 218) {
                         start += sce->ics.swb_sizes[g];
                         continue;
                     }
@@ -814,11 +856,11 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
             }
             if (tbits > destbits) {
                 for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
+                    if (sce->sf_idx[i] < 218 - qstep && sce->band_type[i] < NOISE_BT)
                         sce->sf_idx[i] += qstep;
             } else {
                 for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
+                    if (sce->sf_idx[i] > 60 - qstep && sce->band_type[i] < NOISE_BT)
                         sce->sf_idx[i] -= qstep;
             }
             qstep >>= 1;
@@ -831,7 +873,7 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
         for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
             for (g = 0; g < sce->ics.num_swb; g++) {
                 int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
+                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60 && sce->band_type[w*16+g] < NOISE_BT) {
                     if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
                         sce->sf_idx[w*16+g]--;
                     else //Try to make sure there is some energy in every band
@@ -839,6 +881,8 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                 }
                 sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
                 sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
+                if (sce->band_type[w*16+g] >= NOISE_BT)
+                    continue;
                 if (sce->sf_idx[w*16+g] != prevsc)
                     fflag = 1;
                 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-- 
2.1.4



More information about the ffmpeg-devel mailing list