[FFmpeg-cvslog] AAC encoder: refactor to resynchronize MIPS port

Thu Sep 17 04:24:19 CEST 2015

ffmpeg | branch: master | Claudio Freire <klaussfreire at gmail.com> | Tue Sep 15 03:59:45 2015 -0300| [8df9bf8e39b5f38c75d63c0ef17965382e634b1c] | committer: Claudio Freire

AAC encoder: refactor to resynchronize MIPS port

This patch refactors the AAC coders to reuse code
between the MIPS port and the regular, portable C code.
There were two main functions that had to use
hand-optimized versions of quantization code:
 - search_for_quantizers_twoloop
 - codebook_trellis_rate

Those two were split into their own template header
files so they can be inlined inside both the MIPS port
and the generic code. In each context, they'll link
to their specialized implementations, and thus be
optimized by the compiler.

This approach I believe is better than maintaining
several copies of each function. As past experience has
proven, having to keep those in sync was error prone.
In this way, they will remain in sync by default.

Also, an implementation of the dequantized output
argument for the optimized quantize_and_encode
functions is included in the patch. While the current
implementation of search_for_pred still isn't using
it, future iterations of main prediction probably will.
It should not imply any measurable performance hit while
not being used.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8df9bf8e39b5f38c75d63c0ef17965382e634b1c
---

 libavcodec/aaccoder.c            |  284 +-----------------------
 libavcodec/aaccoder_trellis.h    |  194 ++++++++++++++++
 libavcodec/aaccoder_twoloop.h    |  203 +++++++++++++++++
 libavcodec/aacenc_quantization.h |   14 ++
 libavcodec/mips/aaccoder_mips.c  |  452 ++++++++++----------------------------
 tests/fate/aac.mak               |   15 +-
 6 files changed, 538 insertions(+), 624 deletions(-)

diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index 524987d..8d5ea77 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -48,6 +48,8 @@
 #include "aacenc_tns.h"
 #include "aacenc_pred.h"
 
+#include "libavcodec/aaccoder_twoloop.h"
+
 /** Frequency in Hz for lower limit of noise substitution **/
 #define NOISE_LOW_LIMIT 4000
 
@@ -59,6 +61,8 @@
  * replace low energy non zero bands */
 #define NOISE_LAMBDA_REPLACE 1.948f
 
+#include "libavcodec/aaccoder_trellis.h"
+
 /**
  * structure used in optimal codebook search
  */
@@ -181,137 +185,6 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
     }
 }
 
-static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
-                                  int win, int group_len, const float lambda)
-{
-    BandCodingPath path[120][CB_TOT_ALL];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
-
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < CB_TOT_ALL; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < CB_TOT_ALL; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            startcb = aac_cb_in_map[startcb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
-                    path[swb+1][cb].cost = 61450;
-                    path[swb+1][cb].prev_idx = -1;
-                    path[swb+1][cb].run = 0;
-                    continue;
-                }
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost(s, &sce->coeffs[start + w*128],
-                                               &s->scoefs[start + w*128], size,
-                                               sce->sf_idx[win*16+swb],
-                                               aac_cb_out_map[cb],
-                                               0, INFINITY, NULL, 0);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
-            }
-        }
-        start += sce->ics.swb_sizes[swb];
-    }
-
-    //convert resulting path from backward-linked list
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < CB_TOT_ALL; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        av_assert1(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-    //perform actual band info encoding
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        cb = aac_cb_out_map[stackcb[i]];
-        put_bits(&s->pb, 4, cb);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !cb, count);
-        //XXX: memset when band_type is also uint8_t
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] = cb;
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
-        }
-        put_bits(&s->pb, run_bits, count);
-    }
-}
 
 typedef struct TrellisPath {
     float cost;
@@ -508,155 +381,6 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                 sce->sf_idx[(w+w2)*16+g] = sce->sf_idx[w*16+g];
 }
 
-/**
- * two-loop quantizers search taken from ISO 13818-7 Appendix C
- */
-static void search_for_quantizers_twoloop(AVCodecContext *avctx,
-                                          AACEncContext *s,
-                                          SingleChannelElement *sce,
-                                          const float lambda)
-{
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128] = { 0 };
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
-
-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
-    destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            int nz = 0;
-            float uplim = 0.0f, energy = 0.0f;
-            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim  += band->threshold;
-                energy += band->energy;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
-                    sce->zeroes[(w+w2)*16+g] = 1;
-                    continue;
-                }
-                nz = 1;
-            }
-            uplims[w*16+g] = uplim *512;
-            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
-            allz |= nz;
-        }
-    }
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            if (sce->zeroes[w*16+g]) {
-                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
-            }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
-        }
-    }
-
-    if (!allz)
-        return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *scaled = s->scoefs + start;
-            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
-            start += sce->ics.swb_sizes[g];
-        }
-    }
-
-    //perform two-loop search
-    //outer loop - improve quality
-    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
-        //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
-        do {
-            int prev = -1;
-            tbits = 0;
-            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                start = w*128;
-                for (g = 0;  g < sce->ics.num_swb; g++) {
-                    const float *coefs = &sce->coeffs[start];
-                    const float *scaled = &s->scoefs[start];
-                    int bits = 0;
-                    int cb;
-                    float dist = 0.0f;
-
-                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                        start += sce->ics.swb_sizes[g];
-                        continue;
-                    }
-                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                        int b;
-                        dist += quantize_band_cost(s, coefs + w2*128,
-                                                   scaled + w2*128,
-                                                   sce->ics.swb_sizes[g],
-                                                   sce->sf_idx[w*16+g],
-                                                   cb,
-                                                   1.0f,
-                                                   INFINITY,
-                                                   &b,
-                                                   0);
-                        bits += b;
-                    }
-                    dists[w*16+g] = dist - bits;
-                    if (prev != -1) {
-                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                    }
-                    tbits += bits;
-                    start += sce->ics.swb_sizes[g];
-                    prev = sce->sf_idx[w*16+g];
-                }
-            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
-            }
-            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
-                qstep = 1;
-        } while (qstep);
-
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
-
-        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-            for (g = 0; g < sce->ics.num_swb; g++) {
-                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
-                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
-                if (sce->sf_idx[w*16+g] != prevsc)
-                    fflag = 1;
-                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-            }
-        }
-        its++;
-    } while (fflag && its < 10);
-}
 
 static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
                                        SingleChannelElement *sce,
diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h
new file mode 100644
index 0000000..7d685eb
--- /dev/null
+++ b/libavcodec/aaccoder_trellis.h
@@ -0,0 +1,194 @@
+/*
+ * AAC encoder trellis codebook selector
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder trellis codebook selector
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the codebook_trellis_rate selector function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost_bits
+ *  - abs_pow34_v
+ */
+
+#ifndef AVCODEC_AACCODER_TRELLIS_H
+#define AVCODEC_AACCODER_TRELLIS_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+#include "aac_tablegen_decl.h"
+
+
+/**
+ * structure used in optimal codebook search
+ */
+typedef struct TrellisBandCodingPath {
+    int prev_idx; ///< pointer to the previous path point
+    float cost;   ///< path cost
+    int run;
+} TrellisBandCodingPath;
+
+
+static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
+                                  int win, int group_len, const float lambda)
+{
+    TrellisBandCodingPath path[120][CB_TOT_ALL];
+    int w, swb, cb, start, size;
+    int i, j;
+    const int max_sfb  = sce->ics.max_sfb;
+    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
+    const int run_esc  = (1 << run_bits) - 1;
+    int idx, ppos, count;
+    int stackrun[120], stackcb[120], stack_len;
+    float next_minbits = INFINITY;
+    int next_mincb = 0;
+
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+    start = win*128;
+    for (cb = 0; cb < CB_TOT_ALL; cb++) {
+        path[0][cb].cost     = run_bits+4;
+        path[0][cb].prev_idx = -1;
+        path[0][cb].run      = 0;
+    }
+    for (swb = 0; swb < max_sfb; swb++) {
+        size = sce->ics.swb_sizes[swb];
+        if (sce->zeroes[win*16 + swb]) {
+            float cost_stay_here = path[swb][0].cost;
+            float cost_get_here  = next_minbits + run_bits + 4;
+            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
+                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
+                cost_stay_here += run_bits;
+            if (cost_get_here < cost_stay_here) {
+                path[swb+1][0].prev_idx = next_mincb;
+                path[swb+1][0].cost     = cost_get_here;
+                path[swb+1][0].run      = 1;
+            } else {
+                path[swb+1][0].prev_idx = 0;
+                path[swb+1][0].cost     = cost_stay_here;
+                path[swb+1][0].run      = path[swb][0].run + 1;
+            }
+            next_minbits = path[swb+1][0].cost;
+            next_mincb = 0;
+            for (cb = 1; cb < CB_TOT_ALL; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+        } else {
+            float minbits = next_minbits;
+            int mincb = next_mincb;
+            int startcb = sce->band_type[win*16+swb];
+            startcb = aac_cb_in_map[startcb];
+            next_minbits = INFINITY;
+            next_mincb = 0;
+            for (cb = 0; cb < startcb; cb++) {
+                path[swb+1][cb].cost = 61450;
+                path[swb+1][cb].prev_idx = -1;
+                path[swb+1][cb].run = 0;
+            }
+            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
+                float cost_stay_here, cost_get_here;
+                float bits = 0.0f;
+                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
+                    path[swb+1][cb].cost = 61450;
+                    path[swb+1][cb].prev_idx = -1;
+                    path[swb+1][cb].run = 0;
+                    continue;
+                }
+                for (w = 0; w < group_len; w++) {
+                    bits += quantize_band_cost_bits(s, &sce->coeffs[start + w*128],
+                                               &s->scoefs[start + w*128], size,
+                                               sce->sf_idx[win*16+swb],
+                                               aac_cb_out_map[cb],
+                                               0, INFINITY, NULL, 0);
+                }
+                cost_stay_here = path[swb][cb].cost + bits;
+                cost_get_here  = minbits            + bits + run_bits + 4;
+                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
+                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
+                    cost_stay_here += run_bits;
+                if (cost_get_here < cost_stay_here) {
+                    path[swb+1][cb].prev_idx = mincb;
+                    path[swb+1][cb].cost     = cost_get_here;
+                    path[swb+1][cb].run      = 1;
+                } else {
+                    path[swb+1][cb].prev_idx = cb;
+                    path[swb+1][cb].cost     = cost_stay_here;
+                    path[swb+1][cb].run      = path[swb][cb].run + 1;
+                }
+                if (path[swb+1][cb].cost < next_minbits) {
+                    next_minbits = path[swb+1][cb].cost;
+                    next_mincb = cb;
+                }
+            }
+        }
+        start += sce->ics.swb_sizes[swb];
+    }
+
+    //convert resulting path from backward-linked list
+    stack_len = 0;
+    idx       = 0;
+    for (cb = 1; cb < CB_TOT_ALL; cb++)
+        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
+            idx = cb;
+    ppos = max_sfb;
+    while (ppos > 0) {
+        av_assert1(idx >= 0);
+        cb = idx;
+        stackrun[stack_len] = path[ppos][cb].run;
+        stackcb [stack_len] = cb;
+        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
+        ppos -= path[ppos][cb].run;
+        stack_len++;
+    }
+    //perform actual band info encoding
+    start = 0;
+    for (i = stack_len - 1; i >= 0; i--) {
+        cb = aac_cb_out_map[stackcb[i]];
+        put_bits(&s->pb, 4, cb);
+        count = stackrun[i];
+        memset(sce->zeroes + win*16 + start, !cb, count);
+        //XXX: memset when band_type is also uint8_t
+        for (j = 0; j < count; j++) {
+            sce->band_type[win*16 + start] = cb;
+            start++;
+        }
+        while (count >= run_esc) {
+            put_bits(&s->pb, run_bits, run_esc);
+            count -= run_esc;
+        }
+        put_bits(&s->pb, run_bits, count);
+    }
+}
+
+
+#endif /* AVCODEC_AACCODER_TRELLIS_H */
diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h
new file mode 100644
index 0000000..5ac09dc
--- /dev/null
+++ b/libavcodec/aaccoder_twoloop.h
@@ -0,0 +1,203 @@
+/*
+ * AAC encoder twoloop coder
+ * Copyright (C) 2008-2009 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * AAC encoder twoloop coder
+ * @author Konstantin Shishkov
+ */
+
+/**
+ * This file contains a template for the twoloop coder function.
+ * It needs to be provided, externally, as an already included declaration,
+ * the following functions from aacenc_quantization/util.h. They're not included
+ * explicitly here to make it possible to provide alternative implementations:
+ *  - quantize_band_cost
+ *  - abs_pow34_v
+ *  - find_max_val
+ *  - find_min_book
+ */
+
+#ifndef AVCODEC_AACCODER_TWOLOOP_H
+#define AVCODEC_AACCODER_TWOLOOP_H
+
+#include <float.h>
+#include "libavutil/mathematics.h"
+#include "avcodec.h"
+#include "put_bits.h"
+#include "aac.h"
+#include "aacenc.h"
+#include "aactab.h"
+#include "aacenctab.h"
+#include "aac_tablegen_decl.h"
+
+
+/**
+ * two-loop quantizers search taken from ISO 13818-7 Appendix C
+ */
+static void search_for_quantizers_twoloop(AVCodecContext *avctx,
+                                          AACEncContext *s,
+                                          SingleChannelElement *sce,
+                                          const float lambda)
+{
+    int start = 0, i, w, w2, g;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
+    float dists[128] = { 0 }, uplims[128] = { 0 };
+    float maxvals[128];
+    int fflag, minscaler;
+    int its  = 0;
+    int allz = 0;
+    float minthr = INFINITY;
+
+    // for values above this the decoder might end up in an endless loop
+    // due to always having more bits than what can be encoded.
+    destbits = FFMIN(destbits, 5800);
+    //XXX: some heuristic to determine initial quantizers will reduce search time
+    //determine zero bands and upper limits
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            int nz = 0;
+            float uplim = 0.0f, energy = 0.0f;
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                uplim  += band->threshold;
+                energy += band->energy;
+                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                    sce->zeroes[(w+w2)*16+g] = 1;
+                    continue;
+                }
+                nz = 1;
+            }
+            uplims[w*16+g] = uplim *512;
+            sce->zeroes[w*16+g] = !nz;
+            if (nz)
+                minthr = FFMIN(minthr, uplim);
+            allz |= nz;
+        }
+    }
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (sce->zeroes[w*16+g]) {
+                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
+                continue;
+            }
+            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
+        }
+    }
+
+    if (!allz)
+        return;
+    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            const float *scaled = s->scoefs + start;
+            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    //perform two-loop search
+    //outer loop - improve quality
+    do {
+        int tbits, qstep;
+        minscaler = sce->sf_idx[0];
+        //inner loop - quantize spectrum to fit into given number of bits
+        qstep = its ? 1 : 32;
+        do {
+            int prev = -1;
+            tbits = 0;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                start = w*128;
+                for (g = 0;  g < sce->ics.num_swb; g++) {
+                    const float *coefs = &sce->coeffs[start];
+                    const float *scaled = &s->scoefs[start];
+                    int bits = 0;
+                    int cb;
+                    float dist = 0.0f;
+
+                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                        start += sce->ics.swb_sizes[g];
+                        continue;
+                    }
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                        int b;
+                        dist += quantize_band_cost(s, coefs + w2*128,
+                                                   scaled + w2*128,
+                                                   sce->ics.swb_sizes[g],
+                                                   sce->sf_idx[w*16+g],
+                                                   cb,
+                                                   1.0f,
+                                                   INFINITY,
+                                                   &b,
+                                                   0);
+                        bits += b;
+                    }
+                    dists[w*16+g] = dist - bits;
+                    if (prev != -1) {
+                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
+                    }
+                    tbits += bits;
+                    start += sce->ics.swb_sizes[g];
+                    prev = sce->sf_idx[w*16+g];
+                }
+            }
+            if (tbits > destbits) {
+                for (i = 0; i < 128; i++)
+                    if (sce->sf_idx[i] < 218 - qstep)
+                        sce->sf_idx[i] += qstep;
+            } else {
+                for (i = 0; i < 128; i++)
+                    if (sce->sf_idx[i] > 60 - qstep)
+                        sce->sf_idx[i] -= qstep;
+            }
+            qstep >>= 1;
+            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
+                qstep = 1;
+        } while (qstep);
+
+        fflag = 0;
+        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
+
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0; g < sce->ics.num_swb; g++) {
+                int prevsc = sce->sf_idx[w*16+g];
+                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
+                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
+                        sce->sf_idx[w*16+g]--;
+                    else //Try to make sure there is some energy in every band
+                        sce->sf_idx[w*16+g]-=2;
+                }
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
+                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
+                if (sce->sf_idx[w*16+g] != prevsc)
+                    fflag = 1;
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+            }
+        }
+        its++;
+    } while (fflag && its < 10);
+}
+
+#endif /* AVCODEC_AACCODER_TWOLOOP_H */
diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h
index b514954..6776dc3 100644
--- a/libavcodec/aacenc_quantization.h
+++ b/libavcodec/aacenc_quantization.h
@@ -249,6 +249,20 @@ static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
                                          cb, lambda, uplim, bits, rtz);
 }
 
+static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
+                                const float *scaled, int size, int scale_idx,
+                                int cb, const float lambda, const float uplim,
+                                int *bits, int rtz)
+{
+    int _bits;
+    quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
+                                         cb, 0.0f, uplim, &_bits, rtz);
+    if (bits) {
+        *bits = _bits;
+    }
+    return _bits;
+}
+
 static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
                                             const float *in, float *out, int size, int scale_idx,
                                             int cb, const float lambda, int rtz)
diff --git a/libavcodec/mips/aaccoder_mips.c b/libavcodec/mips/aaccoder_mips.c
index 0f12168..bc3268f 100644
--- a/libavcodec/mips/aaccoder_mips.c
+++ b/libavcodec/mips/aaccoder_mips.c
@@ -63,6 +63,7 @@
 #include "libavcodec/aacenc.h"
 #include "libavcodec/aacenctab.h"
 #include "libavcodec/aactab.h"
+#include "libavcodec/aacenctab.h"
 
 #if HAVE_INLINE_ASM
 typedef struct BandCodingPath {
@@ -199,11 +200,13 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                      int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -211,6 +214,7 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
         int curidx;
         int *in_int = (int *)&in[i];
         int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -262,6 +266,14 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
         curidx += 40;
 
         put_bits(pb, p_bits[curidx], p_codes[curidx]);
+
+        if (out) {
+           vec = &p_vec[curidx*4];
+           out[i+0] = vec[0] * IQ;
+           out[i+1] = vec[1] * IQ;
+           out[i+2] = vec[2] * IQ;
+           out[i+3] = vec[3] * IQ;
+        }
     }
 }
 
@@ -272,11 +284,13 @@ static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                      int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -286,6 +300,7 @@ static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -354,6 +369,14 @@ static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
         v_bits  = p_bits[curidx] + count;
         put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec = &p_vec[curidx*4];
+           out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
+        }
     }
 }
 
@@ -364,11 +387,13 @@ static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                      int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -378,6 +403,7 @@ static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -433,6 +459,15 @@ static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
         v_bits  = p_bits[curidx] + p_bits[curidx2];
         put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx*2 ];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = vec1[0] * IQ;
+           out[i+1] = vec1[1] * IQ;
+           out[i+2] = vec2[0] * IQ;
+           out[i+3] = vec2[1] * IQ;
+        }
     }
 }
 
@@ -443,20 +478,23 @@ static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                       int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
     for (i = 0; i < size; i += 4) {
-        int curidx, sign1, count1, sign2, count2;
+        int curidx1, curidx2, sign1, count1, sign2, count2;
         int *in_int = (int *)&in[i];
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -514,19 +552,28 @@ static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
               "memory"
         );
 
-        curidx  = 8 * qc1;
-        curidx += qc2;
+        curidx1  = 8 * qc1;
+        curidx1 += qc2;
 
-        v_codes = (p_codes[curidx] << count1) | sign1;
-        v_bits  = p_bits[curidx] + count1;
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
         put_bits(pb, v_bits, v_codes);
 
-        curidx  = 8 * qc3;
-        curidx += qc4;
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
 
-        v_codes = (p_codes[curidx] << count2) | sign2;
-        v_bits  = p_bits[curidx] + count2;
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx1*2];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        }
     }
 }
 
@@ -537,20 +584,23 @@ static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                        int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
     for (i = 0; i < size; i += 4) {
-        int curidx, sign1, count1, sign2, count2;
+        int curidx1, curidx2, sign1, count1, sign2, count2;
         int *in_int = (int *)&in[i];
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -607,19 +657,28 @@ static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
             : "memory"
         );
 
-        curidx  = 13 * qc1;
-        curidx += qc2;
+        curidx1  = 13 * qc1;
+        curidx1 += qc2;
 
-        v_codes = (p_codes[curidx] << count1) | sign1;
-        v_bits  = p_bits[curidx] + count1;
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
         put_bits(pb, v_bits, v_codes);
 
-        curidx  = 13 * qc3;
-        curidx += qc4;
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
 
-        v_codes = (p_codes[curidx] << count2) | sign2;
-        v_bits  = p_bits[curidx] + count2;
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
+
+        if (out) {
+           vec1 = &p_vec[curidx1*2];
+           vec2 = &p_vec[curidx2*2];
+           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        }
     }
 }
 
@@ -630,6 +689,7 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                    int *bits, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
 
@@ -647,6 +707,7 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
             uint8_t v_bits;
             unsigned int v_codes;
             int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
 
             qc1 = scaled[i  ] * Q34 + ROUNDING;
             qc2 = scaled[i+1] * Q34 + ROUNDING;
@@ -715,6 +776,15 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
             v_codes = (p_codes[curidx2] << count2) | sign2;
             v_bits  = p_bits[curidx2] + count2;
             put_bits(pb, v_bits, v_codes);
+
+            if (out) {
+               vec1 = &p_vectors[curidx*2 ];
+               vec2 = &p_vectors[curidx2*2];
+               out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
+               out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
+               out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
+               out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+            }
         }
     } else {
         for (i = 0; i < size; i += 4) {
@@ -724,6 +794,7 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
             unsigned int v_codes;
             int c1, c2, c3, c4;
             int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
 
             qc1 = scaled[i  ] * Q34 + ROUNDING;
             qc2 = scaled[i+1] * Q34 + ROUNDING;
@@ -825,6 +896,15 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
                 put_bits(pb, len * 2 - 3, v_codes);
             }
+
+            if (out) {
+               vec1 = &p_vectors[curidx*2];
+               vec2 = &p_vectors[curidx2*2];
+               out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+               out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+               out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+               out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+            }
         }
     }
 }
@@ -1370,7 +1450,7 @@ static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, int rtz)
 {
     return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 }
@@ -2175,195 +2255,12 @@ static float (*const get_band_cost_arr[])(struct AACEncContext *s,
 static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits)
+                                int *bits, int rtz)
 {
     return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 }
 
-static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
-                                               AACEncContext *s,
-                                               SingleChannelElement *sce,
-                                               const float lambda)
-{
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128] = { 0 };
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
-
-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
-    destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            int nz = 0;
-            float uplim = 0.0f, energy = 0.0f;
-            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim  += band->threshold;
-                energy += band->energy;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
-                    sce->zeroes[(w+w2)*16+g] = 1;
-                    continue;
-                }
-                nz = 1;
-            }
-            uplims[w*16+g] = uplim *512;
-            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
-            allz |= nz;
-        }
-    }
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            if (sce->zeroes[w*16+g]) {
-                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
-            }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
-        }
-    }
-
-    if (!allz)
-        return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *scaled = s->scoefs + start;
-            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
-            start += sce->ics.swb_sizes[g];
-        }
-    }
-
-    //perform two-loop search
-    //outer loop - improve quality
-    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
-        //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
-        do {
-            int prev = -1;
-            tbits = 0;
-            fflag = 0;
-
-            if (qstep > 1) {
-                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                    start = w*128;
-                    for (g = 0;  g < sce->ics.num_swb; g++) {
-                        const float *coefs = sce->coeffs + start;
-                        const float *scaled = s->scoefs + start;
-                        int bits = 0;
-                        int cb;
-
-                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                            start += sce->ics.swb_sizes[g];
-                            continue;
-                        }
-                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                            int b;
-                            bits += quantize_band_cost_bits(s, coefs + w2*128,
-                                                            scaled + w2*128,
-                                                            sce->ics.swb_sizes[g],
-                                                            sce->sf_idx[w*16+g],
-                                                            cb,
-                                                            1.0f,
-                                                            INFINITY,
-                                                            &b);
-                        }
-                        if (prev != -1) {
-                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                        }
-                        tbits += bits;
-                        start += sce->ics.swb_sizes[g];
-                        prev = sce->sf_idx[w*16+g];
-                    }
-                }
-            }
-            else {
-                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                    start = w*128;
-                    for (g = 0;  g < sce->ics.num_swb; g++) {
-                        const float *coefs = sce->coeffs + start;
-                        const float *scaled = s->scoefs + start;
-                        int bits = 0;
-                        int cb;
-                        float dist = 0.0f;
-
-                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                            start += sce->ics.swb_sizes[g];
-                            continue;
-                        }
-                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                            int b;
-                            dist += quantize_band_cost(s, coefs + w2*128,
-                                                       scaled + w2*128,
-                                                       sce->ics.swb_sizes[g],
-                                                       sce->sf_idx[w*16+g],
-                                                       cb,
-                                                       1.0f,
-                                                       INFINITY,
-                                                       &b);
-                            bits += b;
-                        }
-                        dists[w*16+g] = dist - bits;
-                        if (prev != -1) {
-                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                        }
-                        tbits += bits;
-                        start += sce->ics.swb_sizes[g];
-                        prev = sce->sf_idx[w*16+g];
-                    }
-                }
-            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
-            }
-            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
-                qstep = 1;
-        } while (qstep);
-
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
-
-        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-            for (g = 0; g < sce->ics.num_swb; g++) {
-                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
-                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
-                if (sce->sf_idx[w*16+g] != prevsc)
-                    fflag = 1;
-                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-            }
-        }
-        its++;
-    } while (fflag && its < 10);
-}
+#include "libavcodec/aaccoder_twoloop.h"
 
 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
 {
@@ -2413,25 +2310,25 @@ static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL);
+                                                lambda / band0->threshold, INFINITY, NULL, 0);
                     dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                 R34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL);
+                                                lambda / band1->threshold, INFINITY, NULL, 0);
                     dist2 += quantize_band_cost(s, M,
                                                 M34,
                                                 sce0->ics.swb_sizes[g],
                                                 sce0->sf_idx[(w+w2)*16+g],
                                                 sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL);
+                                                lambda / maxthr, INFINITY, NULL, 0);
                     dist2 += quantize_band_cost(s, S,
                                                 S34,
                                                 sce1->ics.swb_sizes[g],
                                                 sce1->sf_idx[(w+w2)*16+g],
                                                 sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL);
+                                                lambda / minthr, INFINITY, NULL, 0);
                 }
                 cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
@@ -2441,137 +2338,8 @@ static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
 }
 #endif /*HAVE_MIPSFPU */
 
-static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
-                                       int win, int group_len, const float lambda)
-{
-    BandCodingPath path[120][CB_TOT_ALL];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
-
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < CB_TOT_ALL; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < CB_TOT_ALL; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            startcb = aac_cb_in_map[startcb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < CB_TOT_ALL; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                if (cb >= 12 && sce->band_type[win*16+swb] != aac_cb_out_map[cb]) {
-                    path[swb+1][cb].cost = 61450;
-                    path[swb+1][cb].prev_idx = -1;
-                    path[swb+1][cb].run = 0;
-                    continue;
-                }
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
-                                                    s->scoefs + start + w*128, size,
-                                                    sce->sf_idx[(win+w)*16+swb],
-                                                    aac_cb_out_map[cb],
-                                                    0, INFINITY, NULL);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
-            }
-        }
-        start += sce->ics.swb_sizes[swb];
-    }
+#include "libavcodec/aaccoder_trellis.h"
 
-    //convert resulting path from backward-linked list
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < CB_TOT_ALL; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        av_assert1(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-    //perform actual band info encoding
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        cb = aac_cb_out_map[stackcb[i]];
-        put_bits(&s->pb, 4, cb);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !cb, count);
-        //XXX: memset when band_type is also uint8_t
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] = cb;
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
-        }
-        put_bits(&s->pb, run_bits, count);
-    }
-}
 #endif /* HAVE_INLINE_ASM */
 
 void ff_aac_coder_init_mips(AACEncContext *c) {
@@ -2581,11 +2349,13 @@ void ff_aac_coder_init_mips(AACEncContext *c) {
 
     if (option == 2) {
         e->quantize_and_encode_band = quantize_and_encode_band_mips;
-        e->encode_window_bands_info = codebook_trellis_rate_mips;
+        e->encode_window_bands_info = codebook_trellis_rate;
 #if HAVE_MIPSFPU
-        e->search_for_quantizers    = search_for_quantizers_twoloop_mips;
-        e->search_for_ms            = search_for_ms_mips;
+        e->search_for_quantizers    = search_for_quantizers_twoloop;
 #endif /* HAVE_MIPSFPU */
     }
+#if HAVE_MIPSFPU
+    e->search_for_ms            = search_for_ms_mips;
+#endif /* HAVE_MIPSFPU */
 #endif /* HAVE_INLINE_ASM */
 }
diff --git a/tests/fate/aac.mak b/tests/fate/aac.mak
index cbefc38..72fc59d 100644
--- a/tests/fate/aac.mak
+++ b/tests/fate/aac.mak
@@ -158,6 +158,15 @@ fate-aac-ln-encode: CMP_SHIFT = -4096
 fate-aac-ln-encode: CMP_TARGET = 68
 fate-aac-ln-encode: SIZE_TOLERANCE = 3560
 
+FATE_AAC_ENCODE += fate-aac-ln-encode-128k
+fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-ln-encode-128k: CMP = stddev
+fate-aac-ln-encode-128k: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
+fate-aac-ln-encode-128k: CMP_SHIFT = -4096
+fate-aac-ln-encode-128k: CMP_TARGET = 638
+fate-aac-ln-encode-128k: SIZE_TOLERANCE = 3560
+fate-aac-ln-encode-128k: FUZZ = 5
+
 FATE_AAC_ENCODE += fate-aac-pns-encode
 fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k
 fate-aac-pns-encode: CMP = stddev
@@ -165,7 +174,7 @@ fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.w
 fate-aac-pns-encode: CMP_SHIFT = -4096
 fate-aac-pns-encode: CMP_TARGET = 633.77
 fate-aac-pns-encode: SIZE_TOLERANCE = 3560
-fate-aac-pns-encode: FUZZ = 5
+fate-aac-pns-encode: FUZZ = 1
 
 FATE_AAC_ENCODE += fate-aac-tns-encode
 fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k
@@ -183,7 +192,7 @@ fate-aac-is-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wa
 fate-aac-is-encode: CMP_SHIFT = -4096
 fate-aac-is-encode: CMP_TARGET = 616.75
 fate-aac-is-encode: SIZE_TOLERANCE = 3560
-fate-aac-is-encode: FUZZ = 35
+fate-aac-is-encode: FUZZ = 1
 
 FATE_AAC_ENCODE += fate-aac-pred-encode
 fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
@@ -191,7 +200,7 @@ fate-aac-pred-encode: CMP = stddev
 fate-aac-pred-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pred-encode: CMP_SHIFT = -4096
 fate-aac-pred-encode: CMP_TARGET = 652.60
-fate-aac-pred-encode: FUZZ = 10
+fate-aac-pred-encode: FUZZ = 5
 fate-aac-pred-encode: SIZE_TOLERANCE = 3560
 
 FATE_AAC_LATM += fate-aac-latm_000000001180bc60