[FFmpeg-devel] [PATCH] Optimization of AMR NB and WB decoders for MIPS
Vitor Sessak
vitor1001 at gmail.com
Thu May 24 22:09:51 CEST 2012
Hello again,
On 05/18/2012 03:47 PM, Nedeljko Babic wrote:
> AMR NB and WB decoders are optimized for MIPS architecture.
> Appropriate Makefiles are changed accordingly.
I've given it a second look and I have a few more comments about the ASM
code (I will look tomorrow at the files you just sent).
> +av_always_inline void ff_acelp_apply_order_2_transfer_function(float *out, const float *in,
> + const float zero_coeffs[2],
> + const float pole_coeffs[2],
> + float gain, float mem[2], int n)
> +{
> + /**
> + * loop is unrolled eight times
> + */
> +
> + __asm__ __volatile__ (
> + "lwc1 $f0, 0(%[mem]) \n\t"
> + "blez %[n], ff_acelp_apply_order_2_transfer_function_end%= \n\t"
> + "lwc1 $f1, 4(%[mem]) \n\t"
> + "lwc1 $f2, 0(%[pole_coeffs]) \n\t"
> + "lwc1 $f3, 4(%[pole_coeffs]) \n\t"
> + "lwc1 $f4, 0(%[zero_coeffs]) \n\t"
> + "lwc1 $f5, 4(%[zero_coeffs]) \n\t"
> +
> + "ff_acelp_apply_order_2_transfer_function_madd%=: \n\t"
> +
> + "lwc1 $f6, 0(%[in]) \n\t"
> + "mul.s $f9, $f3, $f1 \n\t"
> + "mul.s $f7, $f2, $f0 \n\t"
> + "msub.s $f7, $f7, %[gain], $f6 \n\t"
> + "sub.s $f7, $f7, $f9 \n\t"
Why not fuse this mul.s/sub.s pair into a single fused op, i.e.
"nmsub.s $f7, $f7, $f3, $f1" (which computes $f7 - $f3*$f1)? Looking at the C
source, it looks like it could be done just with mul.s/msub.s/nmsub.s/madd.s,
with no separate adds or subs.
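A minimal sketch for the first sample (untested; using the MIPS32r2 semantics
msub.s fd, fr, fs, ft = fs*ft - fr and nmsub.s fd, fr, fs, ft = fr - fs*ft):

    "lwc1    $f6, 0(%[in])             \n\t"
    "mul.s   $f7, $f2, $f0             \n\t"  /* pole[0]*mem[0]              */
    "msub.s  $f7, $f7, %[gain], $f6    \n\t"  /* gain*in[0] - pole[0]*mem[0] */
    "nmsub.s $f7, $f7, $f3, $f1        \n\t"  /* ...        - pole[1]*mem[1] */

That is one instruction less per sample than the mul.s/sub.s pair.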
> + "madd.s $f8, $f7, $f4, $f0 \n\t"
> + "madd.s $f8, $f8, $f5, $f1 \n\t"
> + "lwc1 $f11, 4(%[in]) \n\t"
> + "mul.s $f12, $f3, $f0 \n\t"
> + "mul.s $f13, $f2, $f7 \n\t"
> + "msub.s $f13, $f13, %[gain], $f11 \n\t"
> + "sub.s $f13, $f13, $f12 \n\t"
> + "madd.s $f14, $f13, $f4, $f7 \n\t"
> + "madd.s $f14, $f14, $f5, $f0 \n\t"
> + "swc1 $f8, 0(%[out]) \n\t"
> + "lwc1 $f6, 8(%[in]) \n\t"
> + "mul.s $f9, $f3, $f7 \n\t"
> + "mul.s $f15, $f2, $f13 \n\t"
> + "msub.s $f15, $f15, %[gain], $f6 \n\t"
> + "sub.s $f15, $f15, $f9 \n\t"
> + "madd.s $f8, $f15, $f4, $f13 \n\t"
> + "madd.s $f8, $f8, $f5, $f7 \n\t"
> + "swc1 $f14, 4(%[out]) \n\t"
> + "lwc1 $f11, 12(%[in]) \n\t"
> + "mul.s $f12, $f3, $f13 \n\t"
> + "mul.s $f16, $f2, $f15 \n\t"
> + "msub.s $f16, $f16, %[gain], $f11 \n\t"
> + "sub.s $f16, $f16, $f12 \n\t"
> + "madd.s $f14, $f16, $f4, $f15 \n\t"
> + "madd.s $f14, $f14, $f5, $f13 \n\t"
> + "swc1 $f8, 8(%[out]) \n\t"
> + "lwc1 $f6, 16(%[in]) \n\t"
> + "mul.s $f9, $f3, $f15 \n\t"
> + "mul.s $f7, $f2, $f16 \n\t"
> + "msub.s $f7, $f7, %[gain], $f6 \n\t"
> + "sub.s $f7, $f7, $f9 \n\t"
> + "madd.s $f8, $f7, $f4, $f16 \n\t"
> + "madd.s $f8, $f8, $f5, $f15 \n\t"
> + "swc1 $f14, 12(%[out]) \n\t"
> + "lwc1 $f11, 20(%[in]) \n\t"
> + "mul.s $f12, $f3, $f16 \n\t"
> + "mul.s $f13, $f2, $f7 \n\t"
> + "msub.s $f13, $f13, %[gain], $f11 \n\t"
> + "sub.s $f13, $f13, $f12 \n\t"
> + "madd.s $f14, $f13, $f4, $f7 \n\t"
> + "madd.s $f14, $f14, $f5, $f16 \n\t"
> + "swc1 $f8, 16(%[out]) \n\t"
> + "lwc1 $f6, 24(%[in]) \n\t"
> + "mul.s $f9, $f3, $f7 \n\t"
> + "mul.s $f15, $f2, $f13 \n\t"
> + "msub.s $f15, $f15, %[gain], $f6 \n\t"
> + "sub.s $f15, $f15, $f9 \n\t"
> + "madd.s $f8, $f15, $f4, $f13 \n\t"
> + "madd.s $f8, $f8, $f5, $f7 \n\t"
> + "swc1 $f14, 20(%[out]) \n\t"
> + "lwc1 $f11, 28(%[in]) \n\t"
> + "mul.s $f12, $f3, $f13 \n\t"
> + "mul.s $f16, $f2, $f15 \n\t"
> + "msub.s $f16, $f16, %[gain], $f11 \n\t"
> + "sub.s $f16, $f16, $f12 \n\t"
> + "madd.s $f14, $f16, $f4, $f15 \n\t"
> + "madd.s $f14, $f14, $f5, $f13 \n\t"
> + "swc1 $f8, 24(%[out]) \n\t"
> + "addiu %[out], 32 \n\t"
> + "addiu %[in], 32 \n\t"
> + "addiu %[n], -8 \n\t"
> + "swc1 $f15, 4(%[mem]) \n\t"
> + "mov.s $f1, $f15 \n\t"
> + "mov.s $f0, $f16 \n\t"
Can't you avoid these two movs by using $f1 and $f0 as the destination
registers in the previous instructions?
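For instance (untested; the initial values of $f0/$f1 look dead after the
second unrolled step), the last two recursion results could be written into
$f1 and $f0 directly:

    "sub.s  $f1, $f15, $f9             \n\t"  /* instead of sub.s $f15, ... */
    "sub.s  $f0, $f16, $f12            \n\t"  /* instead of sub.s $f16, ... */

with the remaining uses of $f15/$f16 in the tail renamed accordingly, so
both mov.s disappear.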
> + "swc1 $f16, 0(%[mem]) \n\t"
> + "swc1 $f14, -4(%[out]) \n\t"
I think you can update mem outside the main loop.
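Since the two values already stay in $f0/$f1 across iterations, the stores

    "swc1 $f0, 0(%[mem])               \n\t"
    "swc1 $f1, 4(%[mem])               \n\t"

could be done once after the bnez instead of twice per pass.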
> + "bnez %[n], ff_acelp_apply_order_2_transfer_function_madd%= \n\t"
> +
> + "ff_acelp_apply_order_2_transfer_function_end%=: \n\t"
> +
> + : [out] "+r" (out),
> + [in] "+r" (in), [gain] "+f" (gain),
> + [n] "+r" (n), [mem] "+r" (mem)
> + : [zero_coeffs] "r" (zero_coeffs),
> + [pole_coeffs] "r" (pole_coeffs)
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
> + "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
> + "$f12", "$f13", "$f14", "$f15", "$f16"
> + );
> +}
> --- /dev/null
> +++ b/libavcodec/mips/amrwb_lsp2lpc.h
> @@ -0,0 +1,97 @@
> +#ifndef AVCODEC_AMRWB_LSP2LPC_H
> +#define AVCODEC_AMRWB_LSP2LPC_H
> +av_always_inline void ff_amrwb_lsp2lpc(const double *lsp, float *lp, int lp_order)
> +{
> + int lp_half_order = lp_order >> 1;
> + double buf[lp_half_order + 1];
> + double pa[lp_half_order + 1];
> + double *qa = buf + 1;
> + double lsp_lp_o = lsp[lp_order - 1];
> + int i,j;
> + double paf, qaf;
> +
> + qa[-1] = 0.0;
> +
> + ff_lsp2polyf(lsp , pa, lp_half_order );
> + ff_lsp2polyf(lsp + 1, qa, lp_half_order - 1);
> +
> + for (i = 1, j = lp_order - 1; i < lp_half_order; i++, j--) {
> + paf = pa[i];
> + qaf = (qa[i] - qa[i-2]) * (1 - lsp_lp_o);
> +
> + __asm__ __volatile__ (
> + "madd.d %[paf], %[paf], %[paf], %[lsp_lp_o] \n\t"
> +
> + : [paf]"+f"(paf)
> + : [lsp_lp_o]"f"(lsp_lp_o)
> + );
> + lp[i-1] = (paf + qaf) * 0.5;
> + lp[j-1] = (paf - qaf) * 0.5;
> + }
> +
> + paf = pa[lp_half_order] * 0.5;
> +
> + __asm__ __volatile__ (
> + "madd.d %[paf], %[paf], %[paf], %[lsp_lp_o] \n\t"
> +
> + : [paf]"+f"(paf)
> + : [lsp_lp_o]"f"(lsp_lp_o)
> + );
> +
> + lp[lp_half_order - 1] = paf;
> +
> + lp[lp_order - 1] = lsp_lp_o;
> +}
> +#endif /* AVCODEC_AMRWB_LSP2LPC_H */
I don't think you gain much from putting this function in a header as
inline. It should be costly enough that the call overhead is
unimportant...
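I.e. the header could contain just the declaration

    void ff_amrwb_lsp2lpc(const double *lsp, float *lp, int lp_order);

with the definition in a .c file under libavcodec/mips/; performance should
be the same.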
> diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h
> new file mode 100644
> index 0000000..e715df1
> --- /dev/null
> +++ b/libavcodec/mips/amrwbdec_mips.h
> @@ -0,0 +1,186 @@
> +/*
> + * Copyright (c) 2012
> + * MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + * contributors may be used to endorse or promote products derived from
> + * this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author: Nedeljko Babic (nbabic at mips.com)
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Reference: libavcodec/amrwbdec.c
> + */
> +#ifndef AVCODEC_AMRWBDEC_MIPS_H
> +#define AVCODEC_AMRWBDEC_MIPS_H
> +
> +static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
> + float mem[HB_FIR_SIZE], const float *in)
> +{
Same thing for this function.
> + int i;
> + float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
> +
> + memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
> + memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float));
> +
> + for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) {
> + float output;
> + float * p_data = (data+i);
> +
> + /**
> + * inner loop is entirely unrolled and instructions are scheduled
> + * to minimize pipeline stall
> + */
> + __asm__ __volatile__(
> + "mtc1 $zero, %[output] \n\t"
> + "lwc1 $f0, 0(%[p_data]) \n\t"
> + "lwc1 $f1, 0(%[fir_coef]) \n\t"
> + "lwc1 $f2, 4(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f3, 4(%[fir_coef]) \n\t"
> + "lwc1 $f4, 8(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 8(%[fir_coef]) \n\t"
> +
> + "lwc1 $f0, 12(%[p_data]) \n\t"
> + "lwc1 $f1, 12(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f2, 16(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f3, 16(%[fir_coef]) \n\t"
> + "lwc1 $f4, 20(%[p_data]) \n\t"
> + "lwc1 $f5, 20(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 24(%[p_data]) \n\t"
> + "lwc1 $f1, 24(%[fir_coef]) \n\t"
> + "lwc1 $f2, 28(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 28(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 32(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 32(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> +
> + "lwc1 $f0, 36(%[p_data]) \n\t"
> + "lwc1 $f1, 36(%[fir_coef]) \n\t"
> + "lwc1 $f2, 40(%[p_data]) \n\t"
> + "lwc1 $f3, 40(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 44(%[p_data]) \n\t"
> + "lwc1 $f5, 44(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 48(%[p_data]) \n\t"
> + "lwc1 $f1, 48(%[fir_coef]) \n\t"
> + "lwc1 $f2, 52(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 52(%[fir_coef]) \n\t"
> + "lwc1 $f4, 56(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 56(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 60(%[p_data]) \n\t"
> + "lwc1 $f1, 60(%[fir_coef]) \n\t"
> + "lwc1 $f2, 64(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 64(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 68(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 68(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> +
> + "lwc1 $f0, 72(%[p_data]) \n\t"
> + "lwc1 $f1, 72(%[fir_coef]) \n\t"
> + "lwc1 $f2, 76(%[p_data]) \n\t"
> + "lwc1 $f3, 76(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 80(%[p_data]) \n\t"
> + "lwc1 $f5, 80(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 84(%[p_data]) \n\t"
> + "lwc1 $f1, 84(%[fir_coef]) \n\t"
> + "lwc1 $f2, 88(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 88(%[fir_coef]) \n\t"
> + "lwc1 $f4, 92(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 92(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 96(%[p_data]) \n\t"
> + "lwc1 $f1, 96(%[fir_coef]) \n\t"
> + "lwc1 $f2, 100(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 100(%[fir_coef]) \n\t"
> + "lwc1 $f4, 104(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 104(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 108(%[p_data]) \n\t"
> + "lwc1 $f1, 108(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f2, 112(%[p_data]) \n\t"
> + "lwc1 $f3, 112(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 116(%[p_data]) \n\t"
> + "lwc1 $f5, 116(%[fir_coef]) \n\t"
> + "lwc1 $f0, 120(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f1, 120(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> +
> + : [output]"+f"(output)
> + : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data)
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5"
> + );
> + out[i] = output;
> + }
> + memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
> +}
> +
> +av_always_inline void ff_celp_lp_zero_synthesis_filterf(float *out,
> + const float *filter_coeffs,
> + const float *in, int buffer_length,
> + int filter_length)
> +{
> + int i,n;
> + float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
> + float sum_out3, sum_out2, sum_out1;
> + const float *p_filter_coeffs, *p_in;
> +
> + for (n = 0; n < buffer_length; n+=8) {
> + p_in = &in[n];
> + p_filter_coeffs = filter_coeffs;
> + sum_out8 = in[n+7];
> + sum_out7 = in[n+6];
> + sum_out6 = in[n+5];
> + sum_out5 = in[n+4];
> + sum_out4 = in[n+3];
> + sum_out3 = in[n+2];
> + sum_out2 = in[n+1];
> + sum_out1 = in[n];
> + i = filter_length;
> +
> + /* i is always greater than 0
> + * inner loop is unrolled eight times so there is less memory access
> + */
> + __asm__ __volatile__(
> + "filt_lp_inner%=: \n\t"
> + "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t"
> + "lwc1 $f7, 6*4(%[p_in]) \n\t"
> + "lwc1 $f6, 5*4(%[p_in]) \n\t"
> + "lwc1 $f5, 4*4(%[p_in]) \n\t"
> + "lwc1 $f4, 3*4(%[p_in]) \n\t"
> + "lwc1 $f3, 2*4(%[p_in]) \n\t"
> + "lwc1 $f2, 4(%[p_in]) \n\t"
> + "lwc1 $f1, 0(%[p_in]) \n\t"
> + "lwc1 $f0, -4(%[p_in]) \n\t"
> + "addiu %[i], -2 \n\t"
> + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t"
> + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t"
> + "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t"
> + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t"
> + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t"
> + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t"
> + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t"
> + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t"
> + "lwc1 %[fc_val], 4(%[p_filter_coeffs]) \n\t"
> + "mov.s $f7, $f6 \n\t"
> + "mov.s $f6, $f5 \n\t"
> + "mov.s $f5, $f4 \n\t"
> + "mov.s $f4, $f3 \n\t"
> + "mov.s $f3, $f2 \n\t"
> + "mov.s $f2, $f1 \n\t"
> + "mov.s $f1, $f0 \n\t"
If you unroll the inner loop once more, all these movs will be needed
only once per (doubled) iteration, thus half as often.
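Roughly (untested), the second pair of taps could read the still-live
registers under rotated names instead of shifting them first:

    "lwc1   %[fc_val], 4(%[p_filter_coeffs])          \n\t"
    "lwc1   $f7, -8(%[p_in])                          \n\t"  /* old $f7 is dead here */
    "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f6  \n\t"
    "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f5  \n\t"
    "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f4  \n\t"
    "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f3  \n\t"
    "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f2  \n\t"
    "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f1  \n\t"
    "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f0  \n\t"
    "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f7  \n\t"

so the mov.s chain is only needed to realign the window once per unrolled
pass.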
> + "lwc1 $f0, -8(%[p_in]) \n\t"
> + "addiu %[p_filter_coeffs], 8 \n\t"
> + "addiu %[p_in], -8 \n\t"
> + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t"
> + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t"
> + "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t"
> + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t"
> + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t"
> + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t"
> + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t"
> + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t"
> + "bgtz %[i], filt_lp_inner%= \n\t"
> +
> + : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
> + [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
> + [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
> + [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
> + [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
> + [p_in]"+r"(p_in), [i]"+r"(i)
> + :
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7"
> + );
> +
> + out[n+7] = sum_out8;
> + out[n+6] = sum_out7;
> + out[n+5] = sum_out6;
> + out[n+4] = sum_out5;
> + out[n+3] = sum_out4;
> + out[n+2] = sum_out3;
> + out[n+1] = sum_out2;
> + out[n] = sum_out1;
> + }
> +}
> +#endif /* AVCODEC_CELP_MATH_MIPS_H */
> diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h
> new file mode 100644
> index 0000000..c91ccee
> --- /dev/null
> +++ b/libavcodec/mips/lsp_mips.h
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2012
> + * MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + * contributors may be used to endorse or promote products derived from
> + * this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author: Nedeljko Babic (nbabic at mips.com)
> + *
> + * LSP routines for ACELP-based codecs optimized for MIPS
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Reference: libavcodec/lsp.c
> + */
> +#ifndef AVCODEC_LSP_MIPS_H
> +#define AVCODEC_LSP_MIPS_H
> +
> +static av_always_inline void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
> +{
> + int i, j = 0;
> + double * p_fi = f;
> + double * p_f = 0;
> +
> + f[0] = 1.0;
> + f[1] = -2 * lsp[0];
> + lsp -= 2;
> +
> + for(i=2; i<=lp_half_order; i++)
> + {
> + double tmp, f_j_2, f_j_1, f_j;
> + double val = lsp[2*i];
> +
> + __asm__ __volatile__(
> + "move %[p_f], %[p_fi] \n\t"
> + "add.d %[val], %[val], %[val] \n\t"
> + "addiu %[p_fi], 8 \n\t"
> + "ldc1 %[f_j_2], 0(%[p_f]) \n\t"
> + "ldc1 %[f_j_1], 8(%[p_f]) \n\t"
> + "neg.d %[val], %[val] \n\t"
You can avoid this neg by replacing val with -val everywhere (madd.d ->
nmsub.d, etc).
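Something like this (untested), keeping val = 2*lsp[2*i] positive:

    "add.d   %[val], %[val], %[val]            \n\t"  /* 2*lsp, no neg.d needed */
    "ldc1    %[f_j_2], 0(%[p_f])               \n\t"
    "ldc1    %[f_j_1], 8(%[p_f])               \n\t"
    "add.d   %[tmp], %[f_j_2], %[f_j_2]        \n\t"
    "nmsub.d %[tmp], %[tmp], %[f_j_1], %[val]  \n\t"  /* 2*f[i-2] - val*f[i-1] */

with the madd.d in the j loop flipped to nmsub.d the same way, and
"f[1] += val" turned into "f[1] -= val" after the block.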
> + "add.d %[tmp], %[f_j_2], %[f_j_2] \n\t"
> + "madd.d %[tmp], %[tmp], %[f_j_1], %[val] \n\t"
> + "mov.d %[f_j], %[f_j_1] \n\t"
> + "addiu %[j], %[i], -2 \n\t"
> + "mov.d %[f_j_1], %[f_j_2] \n\t"
You can probably avoid the two mov.d by renaming the vars in the
previous instructions.
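E.g. (untested) load the two values straight into the names the rest of the
code expects:

    "ldc1   %[f_j_1], 0(%[p_f])                \n\t"  /* f[i-2], was %[f_j_2] */
    "ldc1   %[f_j], 8(%[p_f])                  \n\t"  /* f[i-1], was %[f_j_1] */
    "add.d  %[tmp], %[f_j_1], %[f_j_1]         \n\t"
    "madd.d %[tmp], %[tmp], %[f_j], %[val]     \n\t"
    "ldc1   %[f_j_2], -8(%[p_f])               \n\t"
    "sdc1   %[tmp], 16(%[p_f])                 \n\t"

so the j loop starts with the window already in place.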
> + "ldc1 %[f_j_2], -8(%[p_f]) \n\t"
> + "sdc1 %[tmp], 16(%[p_f]) \n\t"
> + "beqz %[j], ff_lsp2polyf_lp_j_end%= \n\t"
> + "ff_lsp2polyf_lp_j%=: \n\t"
> + "add.d %[tmp], %[f_j], %[f_j_2] \n\t"
> + "madd.d %[tmp], %[tmp], %[f_j_1], %[val] \n\t"
> + "mov.d %[f_j], %[f_j_1] \n\t"
> + "addiu %[j], -1 \n\t"
> + "mov.d %[f_j_1], %[f_j_2] \n\t"
> + "ldc1 %[f_j_2], -16(%[p_f]) \n\t"
> + "sdc1 %[tmp], 8(%[p_f]) \n\t"
> + "addiu %[p_f], -8 \n\t"
> + "bgtz %[j], ff_lsp2polyf_lp_j%= \n\t"
> + "ff_lsp2polyf_lp_j_end%=: \n\t"
> +
> + : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val),
> + [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f),
> + [j]"+r"(j), [p_fi]"+r"(p_fi)
> + : [i]"r"(i)
> + );
> + f[1] += val;
> + }
> +
> +}
-Vitor