[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS
Babic, Nedeljko
nbabic at mips.com
Wed Jul 11 13:00:10 CEST 2012
Hello Vitor,
>> Hi,
>
>Hi and sorry for the delay, was busy getting married :-)
Congratulations :)
>
>>>> libavcodec/dsputil.c | 1 +
>>>> libavcodec/dsputil.h | 1 +
>>>> libavcodec/fft.c | 1 +
>>>> libavcodec/fft.h | 11 +
>>>> libavcodec/fmtconvert.c | 1 +
>>>> libavcodec/fmtconvert.h | 1 +
>>>> libavcodec/mips/Makefile | 4 +
>>>> libavcodec/mips/dsputil_mips.c | 168 +++++++++
>>>> libavcodec/mips/fft_mips.c | 689 +++++++++++++++++++++++++++++++++++++
>>>> libavcodec/mips/fft_table.h | 482 ++++++++++++++++++++++++++
>>>> libavcodec/mips/fmtconvert_mips.c | 336 ++++++++++++++++++
>>>> 11 files changed, 1695 insertions(+), 0 deletions(-)
>>>> create mode 100644 libavcodec/mips/dsputil_mips.c
>>>> create mode 100644 libavcodec/mips/fft_mips.c
>>>> create mode 100644 libavcodec/mips/fft_table.h
>>>> create mode 100644 libavcodec/mips/fmtconvert_mips.c
>>>>
>>>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>>>> index 442b900..b7d928f 100644
>>>> --- a/libavcodec/dsputil.c
>>>> +++ b/libavcodec/dsputil.c
>>>> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>>> if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
>>>> if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
>>>> if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
>>>> + if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx);
>>>
>>>> --- a/libavcodec/fft.h
>>>> +++ b/libavcodec/fft.h
>>>> @@ -38,6 +38,16 @@
>>>>
>>>> typedef float FFTDouble;
>>>>
>>>> +#if ARCH_MIPS
>>>> +enum _fftConsts{
>>>> + MIN_LOG2_NFFT = 5, //!< Specifies miniumum allowed fft size
>>>> + MAX_LOG2_NFFT = 12 //!< Specifies maxiumum allowed fft size
>>>> +};
>>>> +
>>>> +#define MAX_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>>> +#define MIN_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>>> +
>>>> +#endif
>>>
>>> MIPS-specific code should not be in common code.
>>
>> I will place this in appropriate MIPS file.
>>
>>>
>>>> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
>>>> new file mode 100644
>>>
>>> Nice, can you post the benchmark results of "fft-test -s"?
>>
>> Posted below
>>
>>>
>>>> index 0000000..286c67f
>>>> --- /dev/null
>>>> +++ b/libavcodec/mips/fft_mips.c
>>>> @@ -0,0 +1,689 @@
>>>> +/*
>>>> + * Copyright (c) 2012
>>>> + * MIPS Technologies, Inc., California.
>>>> + *
>>>> + * Redistribution and use in source and binary forms, with or without
>>>> + * modification, are permitted provided that the following conditions
>>>> + * are met:
>>>> + * 1. Redistributions of source code must retain the above copyright
>>>> + * notice, this list of conditions and the following disclaimer.
>>>> + * 2. Redistributions in binary form must reproduce the above copyright
>>>> + * notice, this list of conditions and the following disclaimer in the
>>>> + * documentation and/or other materials provided with the distribution.
>>>> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
>>>> + * contributors may be used to endorse or promote products derived from
>>>> + * this software without specific prior written permission.
>>>> + *
>>>> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
>>>> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>>>> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>>>> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
>>>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>>>> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>>>> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>>>> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>>>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>>>> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>>>> + * SUCH DAMAGE.
>>>> + *
>>>> + * Author: Stanisalv Ocovaj (socovaj at mips.com)
>>>> + * Author: Zoran Lukic (zoranl at mips.com)
>>>> + *
>>>> + * Optimized MDCT/IMDCT and FFT transforms
>>>> + *
>>>> + * This file is part of FFmpeg.
>>>> + *
>>>> + * FFmpeg is free software; you can redistribute it and/or
>>>> + * modify it under the terms of the GNU Lesser General Public
>>>> + * License as published by the Free Software Foundation; either
>>>> + * version 2.1 of the License, or (at your option) any later version.
>>>> + *
>>>> + * FFmpeg is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>>> + * Lesser General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU Lesser General Public
>>>> + * License along with FFmpeg; if not, write to the Free Software
>>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>>> + */
>>>> +#include "config.h"
>>>> +#include "libavcodec/fft.h"
>>>> +#include "fft_table.h"
>>>> +
>>>> +/**
>>>> + * FFT transform
>>>> + */
>>>> +
>>>> +#if HAVE_INLINE_ASM
>>>> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
>>>> +
>>>> + int nbits, i, n, num_transforms, offset, step;
>>>> + int n4, n2, n34;
>>>> + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
>>>> + FFTComplex *tmpz;
>>>> + float w_re, w_im;
>>>> + float *w_re_ptr;
>>>> + const int fft_size = (1<< s->nbits);
>>>> + int s_n = s->nbits;
>>>> + int tem1, tem2;
>>>> + float pom, pom1, pom2, pom3;
>>>> + float temp, temp1, temp3, temp4;
>>>> + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
>>>> + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
>>>> +
>>>> + /**
>>>> + *num_transforms = (0x2aab>> (16 - s->nbits)) | 1;
>>>> + */
>>>> + __asm__ __volatile__ (
>>>> + "li %[tem1], 16 \n\t"
>>>> + "sub %[s_n], %[tem1], %[s_n] \n\t"
>>>> + "li %[tem2], 10923 \n\t"
>>>> + "srav %[tem2], %[tem2], %[s_n] \n\t"
>>>> + "ori %[num_t],%[tem2], 1 \n\t"
>>>> + : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
>>>> + [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
>>>> + );
>>>> +
>>>> +
>>>> + for (n=0; n<num_transforms; n++)
>>>> + {
>>>> + offset = fft_offsets_lut[n]<< 2;
>>>> + tmpz = z + offset;
>>>
>>> What is the point of this LUT? If you want your input permutated in some
>>> particular order, you can just use revtab struct.
>>
>> We use this LUT because of the somewhat irregular (when compared to radix-2 FFT) structure
>> of split-radix FFT. It tells us at which starting offsets the sub-transforms of particular
>> size have to be performed. It has nothing to do with input permutation, which is the same
>> as in the original algorithm (we use the same revtab for that).
>
>So, if I understand correctly, your code needs a LUT and the C version
>doesn't because you are not using a recursive algorithm. If this is the
>case, I think it would be worth using large static tables and
>initializing them dynamically in fft_init(). That way, since you will not
>make the binary bigger, you can make your code work for all the
>supported FFT sizes.
You are correct, and the code will be changed according to your suggestion.
>
>>>> +
>>>> + tmp1 = tmpz[0].re + tmpz[1].re;
>>>> + tmp5 = tmpz[2].re + tmpz[3].re;
>>>> + tmp2 = tmpz[0].im + tmpz[1].im;
>>>> + tmp6 = tmpz[2].im + tmpz[3].im;
>>>> + tmp3 = tmpz[0].re - tmpz[1].re;
>>>> + tmp8 = tmpz[2].im - tmpz[3].im;
>>>> + tmp4 = tmpz[0].im - tmpz[1].im;
>>>> + tmp7 = tmpz[2].re - tmpz[3].re;
>>>> +
>>>> + tmpz[0].re = tmp1 + tmp5;
>>>> + tmpz[2].re = tmp1 - tmp5;
>>>> + tmpz[0].im = tmp2 + tmp6;
>>>> + tmpz[2].im = tmp2 - tmp6;
>>>> + tmpz[1].re = tmp3 + tmp8;
>>>> + tmpz[3].re = tmp3 - tmp8;
>>>> + tmpz[1].im = tmp4 - tmp7;
>>>> + tmpz[3].im = tmp4 + tmp7;
>>>> +
>>>> +}
>>>> +
>>>> + if (fft_size< 8)
>>>> + return;
>>>> +
>>>> + num_transforms = (num_transforms>> 1) | 1;
>>>> + for (n=0; n<num_transforms; n++)
>>>> + {
>>>> + offset = fft_offsets_lut[n]<< 3;
>>>> + tmpz = z + offset;
>>>> +
>>>> + __asm__ __volatile__ (
>>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom1], 56(%[tmpz]) \n\t"
>>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom2], 44(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>>> + "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
>>>> + "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
>>>> + "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
>>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>>> + "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
>>>> + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
>>>> + "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
>>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom1], 44(%[tmpz]) \n\t"
>>>> + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
>>>> + "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
>>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom2], 56(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>>> + "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
>>>> + "lwc1 %[pom], 0(%[tmpz]) \n\t"
>>>> + "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
>>>> + "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
>>>> + "lwc1 %[pom2], 4(%[tmpz]) \n\t"
>>>> + "sub.s %[pom1], %[pom], %[tmp5] \n\t"
>>>> + "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
>>>> + "add.s %[pom3], %[pom], %[tmp5] \n\t"
>>>> + "sub.s %[pom], %[pom2], %[tmp6] \n\t"
>>>> + "add.s %[pom2], %[pom2], %[tmp6] \n\t"
>>>> + "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
>>>> + "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
>>>> + "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
>>>> + "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
>>>> + "lwc1 %[pom1], 16(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom3], 20(%[tmpz]) \n\t"
>>>> + "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
>>>> + "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
>>>> + "sub.s %[temp], %[pom1], %[tmp8] \n\t"
>>>> + "add.s %[pom2], %[pom3], %[tmp7] \n\t"
>>>> + "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
>>>> + "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
>>>> + "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
>>>> + "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
>>>> + "add.s %[pom1], %[pom1], %[tmp8] \n\t"
>>>> + "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
>>>> + "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
>>>> + "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
>>>> + "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
>>>> + "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
>>>> + "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
>>>> + "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
>>>> + "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
>>>> + "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
>>>> + "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
>>>> + "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
>>>> + "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
>>>> + "lwc1 %[temp], 8(%[tmpz]) \n\t"
>>>> + "lwc1 %[temp1],12(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom], 24(%[tmpz]) \n\t"
>>>> + "lwc1 %[pom2], 28(%[tmpz]) \n\t"
>>>> + "sub.s %[temp4],%[temp], %[tmp1] \n\t"
>>>> + "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
>>>> + "add.s %[temp], %[temp], %[tmp1] \n\t"
>>>> + "add.s %[temp1],%[temp1], %[tmp2] \n\t"
>>>> + "sub.s %[pom1], %[pom], %[tmp4] \n\t"
>>>> + "add.s %[pom3], %[pom2], %[tmp3] \n\t"
>>>> + "add.s %[pom], %[pom], %[tmp4] \n\t"
>>>> + "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
>>>> + "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
>>>> + "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
>>>> + "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
>>>> + "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
>>>> + "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
>>>> + "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
>>>> + "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
>>>> + "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
>>>> + : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>>> + [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5), [tmp7]"=f"(tmp7),
>>>> + [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
>>>> + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
>>>> + :
>>>> + : "memory"
>>>> + );
>>>> + }
>>>> +
>>>> + step = 1<< (MAX_LOG2_NFFT - 4);
>>>> + n4 = 4;
>>>> + for (nbits=4; nbits<=s->nbits; nbits++)
>>>> + {
>>>> + /*
>>>> + * num_transforms = (num_transforms>> 1) | 1;
>>>> + */
>>>> + __asm__ __volatile__ (
>>>> + "sra %[num_t], %[num_t], 1 \n\t"
>>>> + "ori %[num_t], %[num_t], 1 \n\t"
>>>> +
>>>> + : [num_t] "+r" (num_transforms)
>>>> + );
>>>> + n2 = 2 * n4;
>>>> + n34 = 3 * n4;
>>>> +
>>>> + for (n=0; n<num_transforms; n++)
>>>> + {
>>>> + offset = fft_offsets_lut[n]<< nbits;
>>>> + tmpz = z + offset;
>>>> +
>>>> + tmpz_n2 = tmpz + n2;
>>>> + tmpz_n4 = tmpz + n4;
>>>> + tmpz_n34 = tmpz + n34;
>>>> +
>>>> + __asm__ __volatile__ (
>>>> + "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
>>>> + "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
>>>> + "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
>>>> + "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
>>>> + "lwc1 %[temp1],0(%[tmpz]) \n\t"
>>>> + "lwc1 %[temp3],4(%[tmpz]) \n\t"
>>>> + "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
>>>> + "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
>>>> + "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
>>>> + "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
>>>> + "sub.s %[temp], %[temp1], %[tmp5] \n\t"
>>>> + "add.s %[temp1],%[temp1], %[tmp5] \n\t"
>>>> + "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
>>>> + "add.s %[temp3],%[temp3], %[tmp6] \n\t"
>>>> + "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
>>>> + "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
>>>> + "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
>>>> + "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
>>>> + "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
>>>> + "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
>>>> + "sub.s %[pom], %[pom1], %[tmp2] \n\t"
>>>> + "add.s %[pom1], %[pom1], %[tmp2] \n\t"
>>>> + "add.s %[temp1],%[temp], %[tmp1] \n\t"
>>>> + "sub.s %[temp], %[temp], %[tmp1] \n\t"
>>>> + "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
>>>> + "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
>>>> + "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
>>>> + "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
>>>> + : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
>>>> + [tmp1]"=f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>>> + [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6), [tmpz_n4]"+r"(tmpz_n4), [pom3]"=&f"(pom3),
>>>> + [temp]"=f"(temp), [temp1]"=f"(temp1), [temp3]"=f"(temp3), [temp4]"=f"(temp4)
>>>> + :
>>>> + : "memory"
>>>> + );
>>>> +
>>>> + w_re_ptr = w_tab + step;
>>>> +
>>>> + for (i=1; i<n4; i++)
>>>> + {
>>>> + w_re = w_re_ptr[0];
>>>> + w_im = w_re_ptr[MAX_FFT_SIZE/4];
>>>
>>> Can you explain why you cannot use the same cos/sin tab that the C
>>> version uses?
>> We can use them and I will rewrite this part of code to use them.
>>
>> ========================
>> fft-test -s results on MIPS 74Kf board:
>> ========================
>> original fft:
>>
>> FFT 512 test
>> Checking...
>> max:0.000008 e:3.92148e-08
>> Speed test...
>> time: 136.9 us/transform [total time=1.12 s its=8192]
>>
>> ========================================================
>> optimized fft:
>>
>> FFT 512 test
>> Checking...
>> max:0.000005 e:3.86258e-08
>> Speed test...
>> time: 89.7 us/transform [total time=1.47 s its=16384]
>
>Pretty impressive :-D
Thanks.
The guys working on this did a good job :)
-Nedeljko
________________________________________
From: ffmpeg-devel-bounces at ffmpeg.org [ffmpeg-devel-bounces at ffmpeg.org] on behalf of Vitor Sessak [vitor1001 at gmail.com]
Sent: Wednesday, July 04, 2012 15:49
To: Babic, Nedeljko
Cc: Lukac, Zeljko; FFmpeg development discussions and patches
Subject: Re: [FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS
On 06/25/2012 10:59 AM, Babic, Nedeljko wrote:
> Hi,
Hi and sorry for the delay, was busy getting married :-)
>>> libavcodec/dsputil.c | 1 +
>>> libavcodec/dsputil.h | 1 +
>>> libavcodec/fft.c | 1 +
>>> libavcodec/fft.h | 11 +
>>> libavcodec/fmtconvert.c | 1 +
>>> libavcodec/fmtconvert.h | 1 +
>>> libavcodec/mips/Makefile | 4 +
>>> libavcodec/mips/dsputil_mips.c | 168 +++++++++
>>> libavcodec/mips/fft_mips.c | 689 +++++++++++++++++++++++++++++++++++++
>>> libavcodec/mips/fft_table.h | 482 ++++++++++++++++++++++++++
>>> libavcodec/mips/fmtconvert_mips.c | 336 ++++++++++++++++++
>>> 11 files changed, 1695 insertions(+), 0 deletions(-)
>>> create mode 100644 libavcodec/mips/dsputil_mips.c
>>> create mode 100644 libavcodec/mips/fft_mips.c
>>> create mode 100644 libavcodec/mips/fft_table.h
>>> create mode 100644 libavcodec/mips/fmtconvert_mips.c
>>>
>>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>>> index 442b900..b7d928f 100644
>>> --- a/libavcodec/dsputil.c
>>> +++ b/libavcodec/dsputil.c
>>> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>> if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
>>> if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
>>> if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
>>> + if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx);
>>
>>> --- a/libavcodec/fft.h
>>> +++ b/libavcodec/fft.h
>>> @@ -38,6 +38,16 @@
>>>
>>> typedef float FFTDouble;
>>>
>>> +#if ARCH_MIPS
>>> +enum _fftConsts{
>>> + MIN_LOG2_NFFT = 5, //!< Specifies miniumum allowed fft size
>>> + MAX_LOG2_NFFT = 12 //!< Specifies maxiumum allowed fft size
>>> +};
>>> +
>>> +#define MAX_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>> +#define MIN_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>> +
>>> +#endif
>>
>> MIPS-specific code should not be in common code.
>
> I will place this in appropriate MIPS file.
>
>>
>>> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
>>> new file mode 100644
>>
>> Nice, can you post the benchmark results of "fft-test -s"?
>
> Posted below
>
>>
>>> index 0000000..286c67f
>>> --- /dev/null
>>> +++ b/libavcodec/mips/fft_mips.c
>>> @@ -0,0 +1,689 @@
>>> +/*
>>> + * Copyright (c) 2012
>>> + * MIPS Technologies, Inc., California.
>>> + *
>>> + * Redistribution and use in source and binary forms, with or without
>>> + * modification, are permitted provided that the following conditions
>>> + * are met:
>>> + * 1. Redistributions of source code must retain the above copyright
>>> + * notice, this list of conditions and the following disclaimer.
>>> + * 2. Redistributions in binary form must reproduce the above copyright
>>> + * notice, this list of conditions and the following disclaimer in the
>>> + * documentation and/or other materials provided with the distribution.
>>> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
>>> + * contributors may be used to endorse or promote products derived from
>>> + * this software without specific prior written permission.
>>> + *
>>> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
>>> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>>> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>>> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
>>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>>> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>>> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>>> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>>> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>>> + * SUCH DAMAGE.
>>> + *
>>> + * Author: Stanisalv Ocovaj (socovaj at mips.com)
>>> + * Author: Zoran Lukic (zoranl at mips.com)
>>> + *
>>> + * Optimized MDCT/IMDCT and FFT transforms
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +#include "config.h"
>>> +#include "libavcodec/fft.h"
>>> +#include "fft_table.h"
>>> +
>>> +/**
>>> + * FFT transform
>>> + */
>>> +
>>> +#if HAVE_INLINE_ASM
>>> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
>>> +
>>> + int nbits, i, n, num_transforms, offset, step;
>>> + int n4, n2, n34;
>>> + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
>>> + FFTComplex *tmpz;
>>> + float w_re, w_im;
>>> + float *w_re_ptr;
>>> + const int fft_size = (1<< s->nbits);
>>> + int s_n = s->nbits;
>>> + int tem1, tem2;
>>> + float pom, pom1, pom2, pom3;
>>> + float temp, temp1, temp3, temp4;
>>> + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
>>> + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
>>> +
>>> + /**
>>> + *num_transforms = (0x2aab>> (16 - s->nbits)) | 1;
>>> + */
>>> + __asm__ __volatile__ (
>>> + "li %[tem1], 16 \n\t"
>>> + "sub %[s_n], %[tem1], %[s_n] \n\t"
>>> + "li %[tem2], 10923 \n\t"
>>> + "srav %[tem2], %[tem2], %[s_n] \n\t"
>>> + "ori %[num_t],%[tem2], 1 \n\t"
>>> + : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
>>> + [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
>>> + );
>>> +
>>> +
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< 2;
>>> + tmpz = z + offset;
>>
>> What is the point of this LUT? If you want your input permutated in some
>> particular order, you can just use revtab struct.
>
> We use this LUT because of the somewhat irregular (when compared to radix-2 FFT) structure
> of split-radix FFT. It tells us at which starting offsets the sub-transforms of particular
> size have to be performed. It has nothing to do with input permutation, which is the same
> as in the original algorithm (we use the same revtab for that).
So, if I understand correctly, your code needs a LUT and the C version
doesn't because you are not using a recursive algorithm. If this is the
case, I think it would be worth using large static tables and
initializing them dynamically in fft_init(). That way, since you will not
make the binary bigger, you can make your code work for all the
supported FFT sizes.
>>> +
>>> + tmp1 = tmpz[0].re + tmpz[1].re;
>>> + tmp5 = tmpz[2].re + tmpz[3].re;
>>> + tmp2 = tmpz[0].im + tmpz[1].im;
>>> + tmp6 = tmpz[2].im + tmpz[3].im;
>>> + tmp3 = tmpz[0].re - tmpz[1].re;
>>> + tmp8 = tmpz[2].im - tmpz[3].im;
>>> + tmp4 = tmpz[0].im - tmpz[1].im;
>>> + tmp7 = tmpz[2].re - tmpz[3].re;
>>> +
>>> + tmpz[0].re = tmp1 + tmp5;
>>> + tmpz[2].re = tmp1 - tmp5;
>>> + tmpz[0].im = tmp2 + tmp6;
>>> + tmpz[2].im = tmp2 - tmp6;
>>> + tmpz[1].re = tmp3 + tmp8;
>>> + tmpz[3].re = tmp3 - tmp8;
>>> + tmpz[1].im = tmp4 - tmp7;
>>> + tmpz[3].im = tmp4 + tmp7;
>>> +
>>> +}
>>> +
>>> + if (fft_size< 8)
>>> + return;
>>> +
>>> + num_transforms = (num_transforms>> 1) | 1;
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< 3;
>>> + tmpz = z + offset;
>>> +
>>> + __asm__ __volatile__ (
>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>> + "lwc1 %[pom1], 56(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 44(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>> + "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
>>> + "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
>>> + "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>> + "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
>>> + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
>>> + "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>> + "lwc1 %[pom1], 44(%[tmpz]) \n\t"
>>> + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
>>> + "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 56(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>> + "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
>>> + "lwc1 %[pom], 0(%[tmpz]) \n\t"
>>> + "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
>>> + "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
>>> + "lwc1 %[pom2], 4(%[tmpz]) \n\t"
>>> + "sub.s %[pom1], %[pom], %[tmp5] \n\t"
>>> + "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
>>> + "add.s %[pom3], %[pom], %[tmp5] \n\t"
>>> + "sub.s %[pom], %[pom2], %[tmp6] \n\t"
>>> + "add.s %[pom2], %[pom2], %[tmp6] \n\t"
>>> + "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
>>> + "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
>>> + "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
>>> + "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
>>> + "lwc1 %[pom1], 16(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 20(%[tmpz]) \n\t"
>>> + "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
>>> + "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
>>> + "sub.s %[temp], %[pom1], %[tmp8] \n\t"
>>> + "add.s %[pom2], %[pom3], %[tmp7] \n\t"
>>> + "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
>>> + "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
>>> + "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
>>> + "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
>>> + "add.s %[pom1], %[pom1], %[tmp8] \n\t"
>>> + "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
>>> + "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
>>> + "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
>>> + "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
>>> + "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
>>> + "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
>>> + "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
>>> + "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
>>> + "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
>>> + "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
>>> + "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
>>> + "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
>>> + "lwc1 %[temp], 8(%[tmpz]) \n\t"
>>> + "lwc1 %[temp1],12(%[tmpz]) \n\t"
>>> + "lwc1 %[pom], 24(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 28(%[tmpz]) \n\t"
>>> + "sub.s %[temp4],%[temp], %[tmp1] \n\t"
>>> + "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
>>> + "add.s %[temp], %[temp], %[tmp1] \n\t"
>>> + "add.s %[temp1],%[temp1], %[tmp2] \n\t"
>>> + "sub.s %[pom1], %[pom], %[tmp4] \n\t"
>>> + "add.s %[pom3], %[pom2], %[tmp3] \n\t"
>>> + "add.s %[pom], %[pom], %[tmp4] \n\t"
>>> + "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
>>> + "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
>>> + "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
>>> + "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
>>> + "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
>>> + "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
>>> + "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
>>> + "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
>>> + "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
>>> + : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>> + [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5), [tmp7]"=f"(tmp7),
>>> + [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
>>> + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
>>> + :
>>> + : "memory"
>>> + );
>>> + }
>>> +
>>> + step = 1<< (MAX_LOG2_NFFT - 4);
>>> + n4 = 4;
>>> + for (nbits=4; nbits<=s->nbits; nbits++)
>>> + {
>>> + /*
>>> + * num_transforms = (num_transforms>> 1) | 1;
>>> + */
>>> + __asm__ __volatile__ (
>>> + "sra %[num_t], %[num_t], 1 \n\t"
>>> + "ori %[num_t], %[num_t], 1 \n\t"
>>> +
>>> + : [num_t] "+r" (num_transforms)
>>> + );
>>> + n2 = 2 * n4;
>>> + n34 = 3 * n4;
>>> +
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< nbits;
>>> + tmpz = z + offset;
>>> +
>>> + tmpz_n2 = tmpz + n2;
>>> + tmpz_n4 = tmpz + n4;
>>> + tmpz_n34 = tmpz + n34;
>>> +
>>> + __asm__ __volatile__ (
>>> + "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
>>> + "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
>>> + "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
>>> + "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
>>> + "lwc1 %[temp1],0(%[tmpz]) \n\t"
>>> + "lwc1 %[temp3],4(%[tmpz]) \n\t"
>>> + "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
>>> + "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
>>> + "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
>>> + "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
>>> + "sub.s %[temp], %[temp1], %[tmp5] \n\t"
>>> + "add.s %[temp1],%[temp1], %[tmp5] \n\t"
>>> + "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
>>> + "add.s %[temp3],%[temp3], %[tmp6] \n\t"
>>> + "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
>>> + "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
>>> + "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
>>> + "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
>>> + "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
>>> + "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
>>> + "sub.s %[pom], %[pom1], %[tmp2] \n\t"
>>> + "add.s %[pom1], %[pom1], %[tmp2] \n\t"
>>> + "add.s %[temp1],%[temp], %[tmp1] \n\t"
>>> + "sub.s %[temp], %[temp], %[tmp1] \n\t"
>>> + "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
>>> + "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
>>> + "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
>>> + "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
>>> + : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
>>> + [tmp1]"=f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>> + [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6), [tmpz_n4]"+r"(tmpz_n4), [pom3]"=&f"(pom3),
>>> + [temp]"=f"(temp), [temp1]"=f"(temp1), [temp3]"=f"(temp3), [temp4]"=f"(temp4)
>>> + :
>>> + : "memory"
>>> + );
>>> +
>>> + w_re_ptr = w_tab + step;
>>> +
>>> + for (i=1; i<n4; i++)
>>> + {
>>> + w_re = w_re_ptr[0];
>>> + w_im = w_re_ptr[MAX_FFT_SIZE/4];
>>
>> Can you explain why you cannot use the same cos/sin tab that the C
>> version uses?
> We can use them and I will rewrite this part of code to use them.
>
> ========================
> fft-test -s results on MIPS 74Kf board:
> ========================
> original fft:
>
> FFT 512 test
> Checking...
> max:0.000008 e:3.92148e-08
> Speed test...
> time: 136.9 us/transform [total time=1.12 s its=8192]
>
> ========================================================
> optimized fft:
>
> FFT 512 test
> Checking...
> max:0.000005 e:3.86258e-08
> Speed test...
> time: 89.7 us/transform [total time=1.47 s its=16384]
Pretty impressive :-D
-Vitor
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel at ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
More information about the ffmpeg-devel mailing list