[FFmpeg-devel] [PATCH] split-radix FFT
Michael Niedermayer
michaelni
Tue Jul 29 02:59:15 CEST 2008
On Fri, Jul 25, 2008 at 08:14:00PM -0600, Loren Merritt wrote:
> $subject, vaguely based on djbfft.
> Changed from djb:
> * added simd.
> * removed the hand-scheduled pentium-pro code. gcc's output from simple C
> is better on all cpus I have access to.
> * removed the distinction between fft and ifft. they're just permutations
> of eachother, so the difference belongs in revtab[] and not in the code.
> * removed the distinction between pass() and pass_big(). C can always use
> the memory-efficient version, and simd never does because the shuffles are
> too costly.
> * made an entirely different pass_big(), to avoid store->load aliasing.
>
> I tried the tangent FFT, but I couldn't make it faster than split-radix.
> Tangent has asymptotically 5% fewer arithmetic ops, but only 1-2% for sizes
> typical of audio codecs, and even a couple extra shuffles or other overhead
> pushes it over.
>
> I tried an in-place fft_permute, but it wasn't any faster than out-of-place
> + memcpy, and quite a bit more complex.
[...]
> - for(i=0;i<n;i++) {
> - m=0;
> - for(j=0;j<nbits;j++) {
> - m |= ((i >> j) & 1) << (nbits-j-1);
> + for(i=0;i<n;i++) {
> + m=0;
> + for(j=0;j<nbits;j++) {
> + m |= ((i >> j) & 1) << (nbits-j-1);
> + }
> + s->revtab[i]=m;
> }
> - s->revtab[i]=m;
> }
the reindention should be in a seperate commit from the rest
[...]
> @@ -23,105 +22,209 @@
> #include "libavutil/x86_cpu.h"
> #include "libavcodec/dsputil.h"
>
> -static const int p1m1[2] __attribute__((aligned(8))) =
> - { 0, 1 << 31 };
> +DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
> +DECLARE_ALIGNED_8(static const int, m1p1[2]) = { 1<<31, 0 };
[...]
>
> -static const int m1p1[2] __attribute__((aligned(8))) =
> - { 1 << 31, 0 };
This should ideally as well be in a seperate commit from the functional
changes
> +#ifdef EMULATE_3DNOWEXT
> +#define PSWAPD(s,d)\
> + "movq "#s","#d"\n"\
> + "psrlq $32,"#d"\n"\
> + "punpckldq "#s","#d"\n"
> +#define PSWAPD_UNARY(s)\
> + "sub $8, %%"REG_SP"\n"\
> + "movd "#s", 4(%%"REG_SP")\n"\
> + "punpckhdq (%%"REG_SP"), "#s"\n"\
> + "add $8, %%"REG_SP"\n"
Gcc failed with a "+m" ?
> +#define ff_fft_calc_3dn2 ff_fft_calc_3dn
> +#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
> +#define ff_imdct_half_3dn2 ff_imdct_half_3dn
> +#else
> +#define PSWAPD(s,d) "pswapd "#s","#d"\n"
> +#define PSWAPD_UNARY(s) "pswapd "#s","#s"\n"
> +#endif
>
> -void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
> +#define T2(m0,m1,z0,z1)\
> + asm volatile(\
> + "movq %0, "#z0" \n"\
> + "movq "#z0", "#z1" \n"\
> + "pfadd %1, "#z0" \n"\
> + "pfsub %1, "#z1" \n"\
> + ::"m"(m0), "m"(m1)\
> + );
> +
> +#define T4(z0,z1,z2,z3,t0,t1)\
> + asm volatile(\
> + "movq "#z2", "#t1" \n" /* FIXME redundant */\
> + "movq "#z2", "#t0" \n" /* {r2,i2} */\
> + "pfsub "#z3", "#t1" \n"\
> + "pfadd "#z3", "#t0" \n" /* {t6,t5} */\
> + "pxor %0, "#t1" \n" /* {t8,t7} */\
> + "movq "#z0", "#z2" \n"\
> + PSWAPD_UNARY(t1)\
> + "pfadd "#t0", "#z0" \n" /* {r0,i0} */\
> + "pfsub "#t0", "#z2" \n" /* {r2,i2} */\
> + "movq "#z1", "#z3" \n"\
> + "pfadd "#t1", "#z1" \n" /* {r1,i1} */\
> + "pfsub "#t1", "#z3" \n" /* {r3,i3} */\
> + ::"m"(*m1p1)\
> + );
> +
> +#define LOAD(mem,mmx)\
> + asm volatile("movq %0, "#mmx ::"m"(mem))
> +
> +#define SAVE(mem,mmx)\
> + asm volatile("movq "#mmx", %0" :"=m"(mem))
> +
> +#define PUNPCK(a,b,c)\
> + asm volatile(\
> + "movq "#a", "#c"\n"\
> + "punpckldq "#b", "#a"\n"\
> + "punpckhdq "#b", "#c"\n"\
> + :);
> +
> +#define PUNPCK_MEM(a,b,c)\
> + asm volatile(\
> + "movq "#a", "#c"\n"\
> + "punpckldq %0, "#a"\n"\
> + "punpckhdq %0, "#c"\n"\
> + ::"m"(b)\
> + );
> +
> +static void fft4(FFTComplex *z)
> {
> - int ln = s->nbits;
> - long j;
> - x86_reg i;
> - long nblocks, nloops;
> - FFTComplex *p, *cptr;
> + T2(z[0], z[1], %%mm0, %%mm1);
> + LOAD(z[2], %%mm2);
> + LOAD(z[3], %%mm3);
> + T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
> + PUNPCK(%%mm0, %%mm1, %%mm4);
> + PUNPCK(%%mm2, %%mm3, %%mm5);
> + SAVE(z[0], %%mm0);
> + SAVE(z[1], %%mm4);
> + SAVE(z[2], %%mm2);
> + SAVE(z[3], %%mm5);
> +}
is there any reason why seperate asm() are chained? I think a single
asm block, or even nasm/yasm if you prefer would be better.
The way its written is almost asking for gcc to put something in between,
iam especially concerned about the -fPIC case and gcc putting all the GOT
"magic" in between the asms ...
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
The misfortune of the wise is better than the prosperity of the fool.
-- Epicurus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080729/41574ded/attachment.pgp>
More information about the ffmpeg-devel
mailing list