[FFmpeg-devel] [PATCH] Optimization of AMR NB and WB decoders for MIPS
Vitor Sessak
vitor1001 at gmail.com
Thu May 24 22:09:51 CEST 2012
Hello again,
On 05/18/2012 03:47 PM, Nedeljko Babic wrote:
> AMR NB and WB decoders are optimized for MIPS architecture.
> Appropriate Makefiles are changed accordingly.
I've given it a second look and I have a few more comments about the ASM
code (I will look tomorrow at the files you just sent).
> +av_always_inline void ff_acelp_apply_order_2_transfer_function(float *out, const float *in,
> + const float zero_coeffs[2],
> + const float pole_coeffs[2],
> + float gain, float mem[2], int n)
> +{
> + /**
> + * loop is unrolled eight times
> + */
> +
> + __asm__ __volatile__ (
> + "lwc1 $f0, 0(%[mem]) \n\t"
> + "blez %[n], ff_acelp_apply_order_2_transfer_function_end%= \n\t"
> + "lwc1 $f1, 4(%[mem]) \n\t"
> + "lwc1 $f2, 0(%[pole_coeffs]) \n\t"
> + "lwc1 $f3, 4(%[pole_coeffs]) \n\t"
> + "lwc1 $f4, 0(%[zero_coeffs]) \n\t"
> + "lwc1 $f5, 4(%[zero_coeffs]) \n\t"
> +
> + "ff_acelp_apply_order_2_transfer_function_madd%=: \n\t"
> +
> + "lwc1 $f6, 0(%[in]) \n\t"
> + "mul.s $f9, $f3, $f1 \n\t"
> + "mul.s $f7, $f2, $f0 \n\t"
> + "msub.s $f7, $f7, %[gain], $f6 \n\t"
> + "sub.s $f7, $f7, $f9 \n\t"
Why not fuse this mul.s/sub.s pair into a single fused op, i.e.
"nmsub.s $f7, $f7, $f3, $f1" (which computes $f7 - $f3*$f1)? Looking at the C
source, it looks like it could be done just with mul.s/msub.s/nmsub.s/madd.s,
with no separate adds or subs.
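A minimal sketch for the first sample (untested; using the MIPS32r2 semantics
msub.s fd, fr, fs, ft = fs*ft - fr and nmsub.s fd, fr, fs, ft = fr - fs*ft):

    "lwc1    $f6, 0(%[in])             \n\t"
    "mul.s   $f7, $f2, $f0             \n\t"  /* pole[0]*mem[0]              */
    "msub.s  $f7, $f7, %[gain], $f6    \n\t"  /* gain*in[0] - pole[0]*mem[0] */
    "nmsub.s $f7, $f7, $f3, $f1        \n\t"  /* ...        - pole[1]*mem[1] */

That is one instruction less per sample than the mul.s/sub.s pair.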
> + "madd.s $f8, $f7, $f4, $f0 \n\t"
> + "madd.s $f8, $f8, $f5, $f1 \n\t"
> + "lwc1 $f11, 4(%[in]) \n\t"
> + "mul.s $f12, $f3, $f0 \n\t"
> + "mul.s $f13, $f2, $f7 \n\t"
> + "msub.s $f13, $f13, %[gain], $f11 \n\t"
> + "sub.s $f13, $f13, $f12 \n\t"
> + "madd.s $f14, $f13, $f4, $f7 \n\t"
> + "madd.s $f14, $f14, $f5, $f0 \n\t"
> + "swc1 $f8, 0(%[out]) \n\t"
> + "lwc1 $f6, 8(%[in]) \n\t"
> + "mul.s $f9, $f3, $f7 \n\t"
> + "mul.s $f15, $f2, $f13 \n\t"
> + "msub.s $f15, $f15, %[gain], $f6 \n\t"
> + "sub.s $f15, $f15, $f9 \n\t"
> + "madd.s $f8, $f15, $f4, $f13 \n\t"
> + "madd.s $f8, $f8, $f5, $f7 \n\t"
> + "swc1 $f14, 4(%[out]) \n\t"
> + "lwc1 $f11, 12(%[in]) \n\t"
> + "mul.s $f12, $f3, $f13 \n\t"
> + "mul.s $f16, $f2, $f15 \n\t"
> + "msub.s $f16, $f16, %[gain], $f11 \n\t"
> + "sub.s $f16, $f16, $f12 \n\t"
> + "madd.s $f14, $f16, $f4, $f15 \n\t"
> + "madd.s $f14, $f14, $f5, $f13 \n\t"
> + "swc1 $f8, 8(%[out]) \n\t"
> + "lwc1 $f6, 16(%[in]) \n\t"
> + "mul.s $f9, $f3, $f15 \n\t"
> + "mul.s $f7, $f2, $f16 \n\t"
> + "msub.s $f7, $f7, %[gain], $f6 \n\t"
> + "sub.s $f7, $f7, $f9 \n\t"
> + "madd.s $f8, $f7, $f4, $f16 \n\t"
> + "madd.s $f8, $f8, $f5, $f15 \n\t"
> + "swc1 $f14, 12(%[out]) \n\t"
> + "lwc1 $f11, 20(%[in]) \n\t"
> + "mul.s $f12, $f3, $f16 \n\t"
> + "mul.s $f13, $f2, $f7 \n\t"
> + "msub.s $f13, $f13, %[gain], $f11 \n\t"
> + "sub.s $f13, $f13, $f12 \n\t"
> + "madd.s $f14, $f13, $f4, $f7 \n\t"
> + "madd.s $f14, $f14, $f5, $f16 \n\t"
> + "swc1 $f8, 16(%[out]) \n\t"
> + "lwc1 $f6, 24(%[in]) \n\t"
> + "mul.s $f9, $f3, $f7 \n\t"
> + "mul.s $f15, $f2, $f13 \n\t"
> + "msub.s $f15, $f15, %[gain], $f6 \n\t"
> + "sub.s $f15, $f15, $f9 \n\t"
> + "madd.s $f8, $f15, $f4, $f13 \n\t"
> + "madd.s $f8, $f8, $f5, $f7 \n\t"
> + "swc1 $f14, 20(%[out]) \n\t"
> + "lwc1 $f11, 28(%[in]) \n\t"
> + "mul.s $f12, $f3, $f13 \n\t"
> + "mul.s $f16, $f2, $f15 \n\t"
> + "msub.s $f16, $f16, %[gain], $f11 \n\t"
> + "sub.s $f16, $f16, $f12 \n\t"
> + "madd.s $f14, $f16, $f4, $f15 \n\t"
> + "madd.s $f14, $f14, $f5, $f13 \n\t"
> + "swc1 $f8, 24(%[out]) \n\t"
> + "addiu %[out], 32 \n\t"
> + "addiu %[in], 32 \n\t"
> + "addiu %[n], -8 \n\t"
> + "swc1 $f15, 4(%[mem]) \n\t"
> + "mov.s $f1, $f15 \n\t"
> + "mov.s $f0, $f16 \n\t"
Can't you avoid these two movs by using $f1 and $f0 as the destination
registers in the previous instructions?
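For instance (untested; the initial values of $f0/$f1 look dead after the
second unrolled step), the last two recursion results could be written into
$f1 and $f0 directly:

    "sub.s  $f1, $f15, $f9             \n\t"  /* instead of sub.s $f15, ... */
    "sub.s  $f0, $f16, $f12            \n\t"  /* instead of sub.s $f16, ... */

with the remaining uses of $f15/$f16 in the tail renamed accordingly, so
both mov.s disappear.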
> + "swc1 $f16, 0(%[mem]) \n\t"
> + "swc1 $f14, -4(%[out]) \n\t"
I think you can update mem outside the main loop.
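Since the two values already stay in $f0/$f1 across iterations, the stores

    "swc1 $f0, 0(%[mem])               \n\t"
    "swc1 $f1, 4(%[mem])               \n\t"

could be done once after the bnez instead of twice per pass.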
> + "bnez %[n], ff_acelp_apply_order_2_transfer_function_madd%= \n\t"
> +
> + "ff_acelp_apply_order_2_transfer_function_end%=: \n\t"
> +
> + : [out] "+r" (out),
> + [in] "+r" (in), [gain] "+f" (gain),
> + [n] "+r" (n), [mem] "+r" (mem)
> + : [zero_coeffs] "r" (zero_coeffs),
> + [pole_coeffs] "r" (pole_coeffs)
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5",
> + "$f6", "$f7", "$f8", "$f9", "$f10", "$f11",
> + "$f12", "$f13", "$f14", "$f15", "$f16"
> + );
> +}
> --- /dev/null
> +++ b/libavcodec/mips/amrwb_lsp2lpc.h
> @@ -0,0 +1,97 @@
> +#ifndef AVCODEC_AMRWB_LSP2LPC_H
> +#define AVCODEC_AMRWB_LSP2LPC_H
> +av_always_inline void ff_amrwb_lsp2lpc(const double *lsp, float *lp, int lp_order)
> +{
> + int lp_half_order = lp_order >> 1;
> + double buf[lp_half_order + 1];
> + double pa[lp_half_order + 1];
> + double *qa = buf + 1;
> + double lsp_lp_o = lsp[lp_order - 1];
> + int i,j;
> + double paf, qaf;
> +
> + qa[-1] = 0.0;
> +
> + ff_lsp2polyf(lsp , pa, lp_half_order );
> + ff_lsp2polyf(lsp + 1, qa, lp_half_order - 1);
> +
> + for (i = 1, j = lp_order - 1; i < lp_half_order; i++, j--) {
> + paf = pa[i];
> + qaf = (qa[i] - qa[i-2]) * (1 - lsp_lp_o);
> +
> + __asm__ __volatile__ (
> + "madd.d %[paf], %[paf], %[paf], %[lsp_lp_o] \n\t"
> +
> + : [paf]"+f"(paf)
> + : [lsp_lp_o]"f"(lsp_lp_o)
> + );
> + lp[i-1] = (paf + qaf) * 0.5;
> + lp[j-1] = (paf - qaf) * 0.5;
> + }
> +
> + paf = pa[lp_half_order] * 0.5;
> +
> + __asm__ __volatile__ (
> + "madd.d %[paf], %[paf], %[paf], %[lsp_lp_o] \n\t"
> +
> + : [paf]"+f"(paf)
> + : [lsp_lp_o]"f"(lsp_lp_o)
> + );
> +
> + lp[lp_half_order - 1] = paf;
> +
> + lp[lp_order - 1] = lsp_lp_o;
> +}
> +#endif /* AVCODEC_AMRWB_LSP2LPC_H */
I don't think you gain much from putting this function in a header as
inline. It should be costly enough that the call overhead is
unimportant...
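I.e. the header could contain just the declaration

    void ff_amrwb_lsp2lpc(const double *lsp, float *lp, int lp_order);

with the definition in a .c file under libavcodec/mips/; performance should
be the same.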
> diff --git a/libavcodec/mips/amrwbdec_mips.h b/libavcodec/mips/amrwbdec_mips.h
> new file mode 100644
> index 0000000..e715df1
> --- /dev/null
> +++ b/libavcodec/mips/amrwbdec_mips.h
> @@ -0,0 +1,186 @@
> +/*
> + * Copyright (c) 2012
> + * MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + * contributors may be used to endorse or promote products derived from
> + * this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author: Nedeljko Babic (nbabic at mips.com)
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Reference: libavcodec/amrwbdec.c
> + */
> +#ifndef AVCODEC_AMRWBDEC_MIPS_H
> +#define AVCODEC_AMRWBDEC_MIPS_H
> +
> +static void hb_fir_filter(float *out, const float fir_coef[HB_FIR_SIZE + 1],
> + float mem[HB_FIR_SIZE], const float *in)
> +{
Same thing for this function.
> + int i;
> + float data[AMRWB_SFR_SIZE_16k + HB_FIR_SIZE]; // past and current samples
> +
> + memcpy(data, mem, HB_FIR_SIZE * sizeof(float));
> + memcpy(data + HB_FIR_SIZE, in, AMRWB_SFR_SIZE_16k * sizeof(float));
> +
> + for (i = 0; i < AMRWB_SFR_SIZE_16k; i++) {
> + float output;
> + float * p_data = (data+i);
> +
> + /**
> + * inner loop is entirely unrolled and instructions are scheduled
> + * to minimize pipeline stall
> + */
> + __asm__ __volatile__(
> + "mtc1 $zero, %[output] \n\t"
> + "lwc1 $f0, 0(%[p_data]) \n\t"
> + "lwc1 $f1, 0(%[fir_coef]) \n\t"
> + "lwc1 $f2, 4(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f3, 4(%[fir_coef]) \n\t"
> + "lwc1 $f4, 8(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 8(%[fir_coef]) \n\t"
> +
> + "lwc1 $f0, 12(%[p_data]) \n\t"
> + "lwc1 $f1, 12(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f2, 16(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f3, 16(%[fir_coef]) \n\t"
> + "lwc1 $f4, 20(%[p_data]) \n\t"
> + "lwc1 $f5, 20(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 24(%[p_data]) \n\t"
> + "lwc1 $f1, 24(%[fir_coef]) \n\t"
> + "lwc1 $f2, 28(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 28(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 32(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 32(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> +
> + "lwc1 $f0, 36(%[p_data]) \n\t"
> + "lwc1 $f1, 36(%[fir_coef]) \n\t"
> + "lwc1 $f2, 40(%[p_data]) \n\t"
> + "lwc1 $f3, 40(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 44(%[p_data]) \n\t"
> + "lwc1 $f5, 44(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 48(%[p_data]) \n\t"
> + "lwc1 $f1, 48(%[fir_coef]) \n\t"
> + "lwc1 $f2, 52(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 52(%[fir_coef]) \n\t"
> + "lwc1 $f4, 56(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 56(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 60(%[p_data]) \n\t"
> + "lwc1 $f1, 60(%[fir_coef]) \n\t"
> + "lwc1 $f2, 64(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 64(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 68(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f5, 68(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> +
> + "lwc1 $f0, 72(%[p_data]) \n\t"
> + "lwc1 $f1, 72(%[fir_coef]) \n\t"
> + "lwc1 $f2, 76(%[p_data]) \n\t"
> + "lwc1 $f3, 76(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 80(%[p_data]) \n\t"
> + "lwc1 $f5, 80(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 84(%[p_data]) \n\t"
> + "lwc1 $f1, 84(%[fir_coef]) \n\t"
> + "lwc1 $f2, 88(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 88(%[fir_coef]) \n\t"
> + "lwc1 $f4, 92(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 92(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 96(%[p_data]) \n\t"
> + "lwc1 $f1, 96(%[fir_coef]) \n\t"
> + "lwc1 $f2, 100(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f3, 100(%[fir_coef]) \n\t"
> + "lwc1 $f4, 104(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f5, 104(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> +
> + "lwc1 $f0, 108(%[p_data]) \n\t"
> + "lwc1 $f1, 108(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "lwc1 $f2, 112(%[p_data]) \n\t"
> + "lwc1 $f3, 112(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> + "lwc1 $f4, 116(%[p_data]) \n\t"
> + "lwc1 $f5, 116(%[fir_coef]) \n\t"
> + "lwc1 $f0, 120(%[p_data]) \n\t"
> + "madd.s %[output], %[output], $f2, $f3 \n\t"
> + "lwc1 $f1, 120(%[fir_coef]) \n\t"
> + "madd.s %[output], %[output], $f4, $f5 \n\t"
> + "madd.s %[output], %[output], $f0, $f1 \n\t"
> +
> + : [output]"+f"(output)
> + : [fir_coef]"r"(fir_coef), [p_data]"r"(p_data)
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5"
> + );
> + out[i] = output;
> + }
> + memcpy(mem, data + AMRWB_SFR_SIZE_16k, HB_FIR_SIZE * sizeof(float));
> +}
> +
> +av_always_inline void ff_celp_lp_zero_synthesis_filterf(float *out,
> + const float *filter_coeffs,
> + const float *in, int buffer_length,
> + int filter_length)
> +{
> + int i,n;
> + float sum_out8, sum_out7, sum_out6, sum_out5, sum_out4, fc_val;
> + float sum_out3, sum_out2, sum_out1;
> + const float *p_filter_coeffs, *p_in;
> +
> + for (n = 0; n < buffer_length; n+=8) {
> + p_in = &in[n];
> + p_filter_coeffs = filter_coeffs;
> + sum_out8 = in[n+7];
> + sum_out7 = in[n+6];
> + sum_out6 = in[n+5];
> + sum_out5 = in[n+4];
> + sum_out4 = in[n+3];
> + sum_out3 = in[n+2];
> + sum_out2 = in[n+1];
> + sum_out1 = in[n];
> + i = filter_length;
> +
> + /* i is always greater than 0
> + * inner loop is unrolled eight times so there is less memory access
> + */
> + __asm__ __volatile__(
> + "filt_lp_inner%=: \n\t"
> + "lwc1 %[fc_val], 0(%[p_filter_coeffs]) \n\t"
> + "lwc1 $f7, 6*4(%[p_in]) \n\t"
> + "lwc1 $f6, 5*4(%[p_in]) \n\t"
> + "lwc1 $f5, 4*4(%[p_in]) \n\t"
> + "lwc1 $f4, 3*4(%[p_in]) \n\t"
> + "lwc1 $f3, 2*4(%[p_in]) \n\t"
> + "lwc1 $f2, 4(%[p_in]) \n\t"
> + "lwc1 $f1, 0(%[p_in]) \n\t"
> + "lwc1 $f0, -4(%[p_in]) \n\t"
> + "addiu %[i], -2 \n\t"
> + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t"
> + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t"
> + "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t"
> + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t"
> + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t"
> + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t"
> + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t"
> + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t"
> + "lwc1 %[fc_val], 4(%[p_filter_coeffs]) \n\t"
> + "mov.s $f7, $f6 \n\t"
> + "mov.s $f6, $f5 \n\t"
> + "mov.s $f5, $f4 \n\t"
> + "mov.s $f4, $f3 \n\t"
> + "mov.s $f3, $f2 \n\t"
> + "mov.s $f2, $f1 \n\t"
> + "mov.s $f1, $f0 \n\t"
If you unroll the inner loop once more, all these movs will be needed
only once per (doubled) iteration, thus half as often.
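Roughly (untested), the second pair of taps could read the still-live
registers under rotated names instead of shifting them first:

    "lwc1   %[fc_val], 4(%[p_filter_coeffs])          \n\t"
    "lwc1   $f7, -8(%[p_in])                          \n\t"  /* old $f7 is dead here */
    "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f6  \n\t"
    "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f5  \n\t"
    "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f4  \n\t"
    "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f3  \n\t"
    "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f2  \n\t"
    "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f1  \n\t"
    "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f0  \n\t"
    "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f7  \n\t"

so the mov.s chain is only needed to realign the window once per unrolled
pass.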
> + "lwc1 $f0, -8(%[p_in]) \n\t"
> + "addiu %[p_filter_coeffs], 8 \n\t"
> + "addiu %[p_in], -8 \n\t"
> + "madd.s %[sum_out8], %[sum_out8], %[fc_val], $f7 \n\t"
> + "madd.s %[sum_out7], %[sum_out7], %[fc_val], $f6 \n\t"
> + "madd.s %[sum_out6], %[sum_out6], %[fc_val], $f5 \n\t"
> + "madd.s %[sum_out5], %[sum_out5], %[fc_val], $f4 \n\t"
> + "madd.s %[sum_out4], %[sum_out4], %[fc_val], $f3 \n\t"
> + "madd.s %[sum_out3], %[sum_out3], %[fc_val], $f2 \n\t"
> + "madd.s %[sum_out2], %[sum_out2], %[fc_val], $f1 \n\t"
> + "madd.s %[sum_out1], %[sum_out1], %[fc_val], $f0 \n\t"
> + "bgtz %[i], filt_lp_inner%= \n\t"
> +
> + : [sum_out8]"+f"(sum_out8), [sum_out7]"+f"(sum_out7),
> + [sum_out6]"+f"(sum_out6), [sum_out5]"+f"(sum_out5),
> + [sum_out4]"+f"(sum_out4), [sum_out3]"+f"(sum_out3),
> + [sum_out2]"+f"(sum_out2), [sum_out1]"+f"(sum_out1),
> + [fc_val]"=&f"(fc_val), [p_filter_coeffs]"+r"(p_filter_coeffs),
> + [p_in]"+r"(p_in), [i]"+r"(i)
> + :
> + : "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7"
> + );
> +
> + out[n+7] = sum_out8;
> + out[n+6] = sum_out7;
> + out[n+5] = sum_out6;
> + out[n+4] = sum_out5;
> + out[n+3] = sum_out4;
> + out[n+2] = sum_out3;
> + out[n+1] = sum_out2;
> + out[n] = sum_out1;
> + }
> +}
> +#endif /* AVCODEC_CELP_MATH_MIPS_H */
> diff --git a/libavcodec/mips/lsp_mips.h b/libavcodec/mips/lsp_mips.h
> new file mode 100644
> index 0000000..c91ccee
> --- /dev/null
> +++ b/libavcodec/mips/lsp_mips.h
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2012
> + * MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + * contributors may be used to endorse or promote products derived from
> + * this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author: Nedeljko Babic (nbabic at mips.com)
> + *
> + * LSP routines for ACELP-based codecs optimized for MIPS
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Reference: libavcodec/lsp.c
> + */
> +#ifndef AVCODEC_LSP_MIPS_H
> +#define AVCODEC_LSP_MIPS_H
> +
> +static av_always_inline void ff_lsp2polyf(const double *lsp, double *f, int lp_half_order)
> +{
> + int i, j = 0;
> + double * p_fi = f;
> + double * p_f = 0;
> +
> + f[0] = 1.0;
> + f[1] = -2 * lsp[0];
> + lsp -= 2;
> +
> + for(i=2; i<=lp_half_order; i++)
> + {
> + double tmp, f_j_2, f_j_1, f_j;
> + double val = lsp[2*i];
> +
> + __asm__ __volatile__(
> + "move %[p_f], %[p_fi] \n\t"
> + "add.d %[val], %[val], %[val] \n\t"
> + "addiu %[p_fi], 8 \n\t"
> + "ldc1 %[f_j_2], 0(%[p_f]) \n\t"
> + "ldc1 %[f_j_1], 8(%[p_f]) \n\t"
> + "neg.d %[val], %[val] \n\t"
You can avoid this neg by replacing val with -val everywhere (madd.d ->
nmsub.d, etc).
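Something like this (untested), keeping val = 2*lsp[2*i] positive:

    "add.d   %[val], %[val], %[val]            \n\t"  /* 2*lsp, no neg.d needed */
    "ldc1    %[f_j_2], 0(%[p_f])               \n\t"
    "ldc1    %[f_j_1], 8(%[p_f])               \n\t"
    "add.d   %[tmp], %[f_j_2], %[f_j_2]        \n\t"
    "nmsub.d %[tmp], %[tmp], %[f_j_1], %[val]  \n\t"  /* 2*f[i-2] - val*f[i-1] */

with the madd.d in the j loop flipped to nmsub.d the same way, and
"f[1] += val" turned into "f[1] -= val" after the block.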
> + "add.d %[tmp], %[f_j_2], %[f_j_2] \n\t"
> + "madd.d %[tmp], %[tmp], %[f_j_1], %[val] \n\t"
> + "mov.d %[f_j], %[f_j_1] \n\t"
> + "addiu %[j], %[i], -2 \n\t"
> + "mov.d %[f_j_1], %[f_j_2] \n\t"
You can probably avoid the two mov.d by renaming the vars in the
previous instructions.
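E.g. (untested) load the two values straight into the names the rest of the
code expects:

    "ldc1   %[f_j_1], 0(%[p_f])                \n\t"  /* f[i-2], was %[f_j_2] */
    "ldc1   %[f_j], 8(%[p_f])                  \n\t"  /* f[i-1], was %[f_j_1] */
    "add.d  %[tmp], %[f_j_1], %[f_j_1]         \n\t"
    "madd.d %[tmp], %[tmp], %[f_j], %[val]     \n\t"
    "ldc1   %[f_j_2], -8(%[p_f])               \n\t"
    "sdc1   %[tmp], 16(%[p_f])                 \n\t"

so the j loop starts with the window already in place.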
> + "ldc1 %[f_j_2], -8(%[p_f]) \n\t"
> + "sdc1 %[tmp], 16(%[p_f]) \n\t"
> + "beqz %[j], ff_lsp2polyf_lp_j_end%= \n\t"
> + "ff_lsp2polyf_lp_j%=: \n\t"
> + "add.d %[tmp], %[f_j], %[f_j_2] \n\t"
> + "madd.d %[tmp], %[tmp], %[f_j_1], %[val] \n\t"
> + "mov.d %[f_j], %[f_j_1] \n\t"
> + "addiu %[j], -1 \n\t"
> + "mov.d %[f_j_1], %[f_j_2] \n\t"
> + "ldc1 %[f_j_2], -16(%[p_f]) \n\t"
> + "sdc1 %[tmp], 8(%[p_f]) \n\t"
> + "addiu %[p_f], -8 \n\t"
> + "bgtz %[j], ff_lsp2polyf_lp_j%= \n\t"
> + "ff_lsp2polyf_lp_j_end%=: \n\t"
> +
> + : [f_j_2]"=&f"(f_j_2), [f_j_1]"=&f"(f_j_1), [val]"+f"(val),
> + [tmp]"=&f"(tmp), [f_j]"=&f"(f_j), [p_f]"+r"(p_f),
> + [j]"+r"(j), [p_fi]"+r"(p_fi)
> + : [i]"r"(i)
> + );
> + f[1] += val;
> + }
> +
> +}
-Vitor