[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS
Vitor Sessak
vitor1001 at gmail.com
Wed Jul 4 15:49:13 CEST 2012
On 06/25/2012 10:59 AM, Babic, Nedeljko wrote:
> Hi,
Hi, and sorry for the delay; I was busy getting married :-)
>>> libavcodec/dsputil.c | 1 +
>>> libavcodec/dsputil.h | 1 +
>>> libavcodec/fft.c | 1 +
>>> libavcodec/fft.h | 11 +
>>> libavcodec/fmtconvert.c | 1 +
>>> libavcodec/fmtconvert.h | 1 +
>>> libavcodec/mips/Makefile | 4 +
>>> libavcodec/mips/dsputil_mips.c | 168 +++++++++
>>> libavcodec/mips/fft_mips.c | 689 +++++++++++++++++++++++++++++++++++++
>>> libavcodec/mips/fft_table.h | 482 ++++++++++++++++++++++++++
>>> libavcodec/mips/fmtconvert_mips.c | 336 ++++++++++++++++++
>>> 11 files changed, 1695 insertions(+), 0 deletions(-)
>>> create mode 100644 libavcodec/mips/dsputil_mips.c
>>> create mode 100644 libavcodec/mips/fft_mips.c
>>> create mode 100644 libavcodec/mips/fft_table.h
>>> create mode 100644 libavcodec/mips/fmtconvert_mips.c
>>>
>>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>>> index 442b900..b7d928f 100644
>>> --- a/libavcodec/dsputil.c
>>> +++ b/libavcodec/dsputil.c
>>> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>> if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
>>> if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
>>> if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
>>> + if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx);
>>
>>> --- a/libavcodec/fft.h
>>> +++ b/libavcodec/fft.h
>>> @@ -38,6 +38,16 @@
>>>
>>> typedef float FFTDouble;
>>>
>>> +#if ARCH_MIPS
>>> +enum _fftConsts{
>>> + MIN_LOG2_NFFT = 5, //!< Specifies minimum allowed fft size
>>> + MAX_LOG2_NFFT = 12 //!< Specifies maximum allowed fft size
>>> +};
>>> +
>>> +#define MAX_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>> +#define MIN_FFT_SIZE (1<< MAX_LOG2_NFFT)
>>> +
>>> +#endif
>>
>> MIPS-specific code should not be in common code.
>
> I will place this in appropriate MIPS file.
>
>>
>>> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
>>> new file mode 100644
>>
>> Nice, can you post the benchmark results of
>> "fft-test -s"?
>
> Posted below
>
>>
>>> index 0000000..286c67f
>>> --- /dev/null
>>> +++ b/libavcodec/mips/fft_mips.c
>>> @@ -0,0 +1,689 @@
>>> +/*
>>> + * Copyright (c) 2012
>>> + * MIPS Technologies, Inc., California.
>>> + *
>>> + * Redistribution and use in source and binary forms, with or without
>>> + * modification, are permitted provided that the following conditions
>>> + * are met:
>>> + * 1. Redistributions of source code must retain the above copyright
>>> + * notice, this list of conditions and the following disclaimer.
>>> + * 2. Redistributions in binary form must reproduce the above copyright
>>> + * notice, this list of conditions and the following disclaimer in the
>>> + * documentation and/or other materials provided with the distribution.
>>> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
>>> + * contributors may be used to endorse or promote products derived from
>>> + * this software without specific prior written permission.
>>> + *
>>> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
>>> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>>> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>>> + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
>>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>>> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>>> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>>> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>>> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>>> + * SUCH DAMAGE.
>>> + *
>>> + * Author: Stanislav Ocovaj (socovaj at mips.com)
>>> + * Author: Zoran Lukic (zoranl at mips.com)
>>> + *
>>> + * Optimized MDCT/IMDCT and FFT transforms
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +#include "config.h"
>>> +#include "libavcodec/fft.h"
>>> +#include "fft_table.h"
>>> +
>>> +/**
>>> + * FFT transform
>>> + */
>>> +
>>> +#if HAVE_INLINE_ASM
>>> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
>>> +
>>> + int nbits, i, n, num_transforms, offset, step;
>>> + int n4, n2, n34;
>>> + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
>>> + FFTComplex *tmpz;
>>> + float w_re, w_im;
>>> + float *w_re_ptr;
>>> + const int fft_size = (1<< s->nbits);
>>> + int s_n = s->nbits;
>>> + int tem1, tem2;
>>> + float pom, pom1, pom2, pom3;
>>> + float temp, temp1, temp3, temp4;
>>> + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
>>> + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
>>> +
>>> + /**
>>> + *num_transforms = (0x2aab>> (16 - s->nbits)) | 1;
>>> + */
>>> + __asm__ __volatile__ (
>>> + "li %[tem1], 16 \n\t"
>>> + "sub %[s_n], %[tem1], %[s_n] \n\t"
>>> + "li %[tem2], 10923 \n\t"
>>> + "srav %[tem2], %[tem2], %[s_n] \n\t"
>>> + "ori %[num_t],%[tem2], 1 \n\t"
>>> + : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
>>> + [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
>>> + );
>>> +
>>> +
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< 2;
>>> + tmpz = z + offset;
>>
>> What is the point of this LUT? If you want your input permuted in some
>> particular order, you can just use the revtab struct.
>
> We use this LUT because of the somewhat irregular (when compared to radix-2 FFT) structure
> of split-radix FFT. It tells us at which starting offsets the sub-transforms of particular
> size have to be performed. It has nothing to do with input permutation, which is the same
> as in the original algorithm (we use the same revtab for that).
So, if I understand correctly, your code needs a LUT and the C version
doesn't because you are not using a recursive algorithm. If this is the
case, I think it would be worth using large static tables and
initializing them dynamically in fft_init(). That way, since you will not
make the binary bigger, you can make your code work for all the
supported FFT sizes.
>>> +
>>> + tmp1 = tmpz[0].re + tmpz[1].re;
>>> + tmp5 = tmpz[2].re + tmpz[3].re;
>>> + tmp2 = tmpz[0].im + tmpz[1].im;
>>> + tmp6 = tmpz[2].im + tmpz[3].im;
>>> + tmp3 = tmpz[0].re - tmpz[1].re;
>>> + tmp8 = tmpz[2].im - tmpz[3].im;
>>> + tmp4 = tmpz[0].im - tmpz[1].im;
>>> + tmp7 = tmpz[2].re - tmpz[3].re;
>>> +
>>> + tmpz[0].re = tmp1 + tmp5;
>>> + tmpz[2].re = tmp1 - tmp5;
>>> + tmpz[0].im = tmp2 + tmp6;
>>> + tmpz[2].im = tmp2 - tmp6;
>>> + tmpz[1].re = tmp3 + tmp8;
>>> + tmpz[3].re = tmp3 - tmp8;
>>> + tmpz[1].im = tmp4 - tmp7;
>>> + tmpz[3].im = tmp4 + tmp7;
>>> +
>>> +}
>>> +
>>> + if (fft_size< 8)
>>> + return;
>>> +
>>> + num_transforms = (num_transforms>> 1) | 1;
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< 3;
>>> + tmpz = z + offset;
>>> +
>>> + __asm__ __volatile__ (
>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>> + "lwc1 %[pom1], 56(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 44(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>> + "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
>>> + "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
>>> + "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
>>> + "lwc1 %[pom], 40(%[tmpz]) \n\t"
>>> + "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
>>> + "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
>>> + "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
>>> + "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
>>> + "lwc1 %[pom1], 44(%[tmpz]) \n\t"
>>> + "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
>>> + "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
>>> + "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 56(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 60(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
>>> + "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
>>> + "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
>>> + "lwc1 %[pom], 0(%[tmpz]) \n\t"
>>> + "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
>>> + "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
>>> + "lwc1 %[pom2], 4(%[tmpz]) \n\t"
>>> + "sub.s %[pom1], %[pom], %[tmp5] \n\t"
>>> + "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
>>> + "add.s %[pom3], %[pom], %[tmp5] \n\t"
>>> + "sub.s %[pom], %[pom2], %[tmp6] \n\t"
>>> + "add.s %[pom2], %[pom2], %[tmp6] \n\t"
>>> + "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
>>> + "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
>>> + "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
>>> + "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
>>> + "lwc1 %[pom1], 16(%[tmpz]) \n\t"
>>> + "lwc1 %[pom3], 20(%[tmpz]) \n\t"
>>> + "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
>>> + "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
>>> + "sub.s %[temp], %[pom1], %[tmp8] \n\t"
>>> + "add.s %[pom2], %[pom3], %[tmp7] \n\t"
>>> + "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
>>> + "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
>>> + "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
>>> + "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
>>> + "add.s %[pom1], %[pom1], %[tmp8] \n\t"
>>> + "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
>>> + "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
>>> + "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
>>> + "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
>>> + "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
>>> + "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
>>> + "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
>>> + "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
>>> + "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
>>> + "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
>>> + "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
>>> + "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
>>> + "lwc1 %[temp], 8(%[tmpz]) \n\t"
>>> + "lwc1 %[temp1],12(%[tmpz]) \n\t"
>>> + "lwc1 %[pom], 24(%[tmpz]) \n\t"
>>> + "lwc1 %[pom2], 28(%[tmpz]) \n\t"
>>> + "sub.s %[temp4],%[temp], %[tmp1] \n\t"
>>> + "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
>>> + "add.s %[temp], %[temp], %[tmp1] \n\t"
>>> + "add.s %[temp1],%[temp1], %[tmp2] \n\t"
>>> + "sub.s %[pom1], %[pom], %[tmp4] \n\t"
>>> + "add.s %[pom3], %[pom2], %[tmp3] \n\t"
>>> + "add.s %[pom], %[pom], %[tmp4] \n\t"
>>> + "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
>>> + "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
>>> + "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
>>> + "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
>>> + "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
>>> + "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
>>> + "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
>>> + "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
>>> + "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
>>> + : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>> + [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5), [tmp7]"=f"(tmp7),
>>> + [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
>>> + [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
>>> + :
>>> + : "memory"
>>> + );
>>> + }
>>> +
>>> + step = 1<< (MAX_LOG2_NFFT - 4);
>>> + n4 = 4;
>>> + for (nbits=4; nbits<=s->nbits; nbits++)
>>> + {
>>> + /*
>>> + * num_transforms = (num_transforms>> 1) | 1;
>>> + */
>>> + __asm__ __volatile__ (
>>> + "sra %[num_t], %[num_t], 1 \n\t"
>>> + "ori %[num_t], %[num_t], 1 \n\t"
>>> +
>>> + : [num_t] "+r" (num_transforms)
>>> + );
>>> + n2 = 2 * n4;
>>> + n34 = 3 * n4;
>>> +
>>> + for (n=0; n<num_transforms; n++)
>>> + {
>>> + offset = fft_offsets_lut[n]<< nbits;
>>> + tmpz = z + offset;
>>> +
>>> + tmpz_n2 = tmpz + n2;
>>> + tmpz_n4 = tmpz + n4;
>>> + tmpz_n34 = tmpz + n34;
>>> +
>>> + __asm__ __volatile__ (
>>> + "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
>>> + "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
>>> + "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
>>> + "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
>>> + "lwc1 %[temp1],0(%[tmpz]) \n\t"
>>> + "lwc1 %[temp3],4(%[tmpz]) \n\t"
>>> + "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
>>> + "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
>>> + "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
>>> + "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
>>> + "sub.s %[temp], %[temp1], %[tmp5] \n\t"
>>> + "add.s %[temp1],%[temp1], %[tmp5] \n\t"
>>> + "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
>>> + "add.s %[temp3],%[temp3], %[tmp6] \n\t"
>>> + "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
>>> + "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
>>> + "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
>>> + "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
>>> + "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
>>> + "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
>>> + "sub.s %[pom], %[pom1], %[tmp2] \n\t"
>>> + "add.s %[pom1], %[pom1], %[tmp2] \n\t"
>>> + "add.s %[temp1],%[temp], %[tmp1] \n\t"
>>> + "sub.s %[temp], %[temp], %[tmp1] \n\t"
>>> + "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
>>> + "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
>>> + "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
>>> + "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
>>> + : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
>>> + [tmp1]"=f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>> + [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6), [tmpz_n4]"+r"(tmpz_n4), [pom3]"=&f"(pom3),
>>> + [temp]"=f"(temp), [temp1]"=f"(temp1), [temp3]"=f"(temp3), [temp4]"=f"(temp4)
>>> + :
>>> + : "memory"
>>> + );
>>> +
>>> + w_re_ptr = w_tab + step;
>>> +
>>> + for (i=1; i<n4; i++)
>>> + {
>>> + w_re = w_re_ptr[0];
>>> + w_im = w_re_ptr[MAX_FFT_SIZE/4];
>>
>> Can you explain why you cannot use the same cos/sin tab that the C
>> version uses?
> We can use them and I will rewrite this part of code to use them.
>
> ========================
> fft-test -s results on MIPS 74Kf board:
> ========================
> original fft:
>
> FFT 512 test
> Checking...
> max:0.000008 e:3.92148e-08
> Speed test...
> time: 136.9 us/transform [total time=1.12 s its=8192]
>
> ========================================================
> optimized fft:
>
> FFT 512 test
> Checking...
> max:0.000005 e:3.86258e-08
> Speed test...
> time: 89.7 us/transform [total time=1.47 s its=16384]
Pretty impressive :-D
-Vitor
More information about the ffmpeg-devel
mailing list