[FFmpeg-devel] [PATCH] split-radix FFT
Michael Niedermayer
michaelni
Tue Jul 29 13:43:43 CEST 2008
On Tue, Jul 29, 2008 at 12:22:59AM -0600, Loren Merritt wrote:
> On Tue, 29 Jul 2008, Michael Niedermayer wrote:
> > On Fri, Jul 25, 2008 at 08:14:00PM -0600, Loren Merritt wrote:
> >
> >> +#ifdef EMULATE_3DNOWEXT
> >> +#define PSWAPD(s,d)\
> >> + "movq "#s","#d"\n"\
> >> + "psrlq $32,"#d"\n"\
> >> + "punpckldq "#s","#d"\n"
> >
> >> +#define PSWAPD_UNARY(s)\
> >> + "sub $8, %%"REG_SP"\n"\
> >> + "movd "#s", 4(%%"REG_SP")\n"\
> >> + "punpckhdq (%%"REG_SP"), "#s"\n"\
> >> + "add $8, %%"REG_SP"\n"
> >
> > Gcc failed with a "+m" ?
>
> No, I just designed the 3dn1 emulation of 3dn2 for simplicity (including
> code locality) rather than speed. I wouldn't have written it at all
> except that then I wouldn't be able to delete the radix-2 init code.
> (I still can't delete it until someone ports split-radix to altivec,
> but I assume that'll happen.)
>
> >> +static void fft4(FFTComplex *z)
> >> {
> >> - int ln = s->nbits;
> >> - long j;
> >> - x86_reg i;
> >> - long nblocks, nloops;
> >> - FFTComplex *p, *cptr;
> >> + T2(z[0], z[1], %%mm0, %%mm1);
> >> + LOAD(z[2], %%mm2);
> >> + LOAD(z[3], %%mm3);
> >> + T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
> >> + PUNPCK(%%mm0, %%mm1, %%mm4);
> >> + PUNPCK(%%mm2, %%mm3, %%mm5);
> >> + SAVE(z[0], %%mm0);
> >> + SAVE(z[1], %%mm4);
> >> + SAVE(z[2], %%mm2);
> >> + SAVE(z[3], %%mm5);
> >> +}
> >
> > > Is there any reason why separate asm() statements are chained? I think a
> > > single asm block, or even nasm/yasm if you prefer, would be better.
>
> Because it works for me, and I don't see any alternatives that are as
> concise.
> yasm, ok.
I prefer code that is easy to maintain over concise code, and code that gcc
can silently pessimize is not easy to maintain IMHO. It can easily cost
someone quite some time to debug why some codec is slower on some gcc
version or when compiled with different flags ...
It would of course be different if such "silent pessimization" were just
hypothetical, but it isn't; gcc is really following Murphy's law here: if it
can mess up, it does.
That's why I would strongly prefer it if gcc couldn't put anything at all
between the asm parts ...
>
> > The way it's written is almost asking for gcc to put something in between;
> > I am especially concerned about the -fPIC case and gcc putting all the GOT
> > "magic" in between the asms ...
>
> Is gcc so stupid as to emit GOT stuff when dereferencing a pointer that's
> already in a register, no global variables involved?
yes, examples below with gcc 4.3.1
fft8:
.LFB92:
.loc 1 97 0
.LVL1:
call __i686.get_pc_thunk.cx # 30 set_got [length = 12]
addl $_GLOBAL_OFFSET_TABLE_, %ecx
.loc 1 97 0
movl 4(%esp), %eax # 2 *movsi_1/1 [length = 4]
.loc 1 98 0
#APP
# 98 "libavcodec/i386/fft_sse.c" 1
movaps (%eax), %xmm0
# 0 "" 2
.loc 1 99 0
# 99 "libavcodec/i386/fft_sse.c" 1
movaps 16(%eax), %xmm2
# 0 "" 2
.loc 1 100 0
# 100 "libavcodec/i386/fft_sse.c" 1
movaps %xmm0, %xmm1
shufps $0x64, %xmm2, %xmm0
shufps $0xce, %xmm2, %xmm1
movaps %xmm0, %xmm2
addps %xmm1, %xmm0
....
fft16:
.LFB93:
.loc 1 111 0
.LVL2:
call __i686.get_pc_thunk.cx # 38 set_got [length = 12]
addl $_GLOBAL_OFFSET_TABLE_, %ecx
.loc 1 111 0
movl 4(%esp), %eax # 2 *movsi_1/1 [length = 4]
.loc 1 112 0
#APP
# 112 "libavcodec/i386/fft_sse.c" 1
movaps (%eax), %xmm0
# 0 "" 2
.loc 1 113 0
# 113 "libavcodec/i386/fft_sse.c" 1
movaps 16(%eax), %xmm2
# 0 "" 2
.loc 1 114 0
# 114 "libavcodec/i386/fft_sse.c" 1
movaps %xmm0, %xmm1
shufps $0x64, %xmm2, %xmm0
shufps $0xce, %xmm2, %xmm1
.....
addps %xmm7, %xmm6
subps %xmm7, %xmm0
movaps %xmm6, %xmm7
shufps $0x88, %xmm0, %xmm6
shufps $0xdd, %xmm0, %xmm7
# 0 "" 2
.loc 1 129 0
#NO_APP
movl ff_cos_16@GOT(%ecx), %edx # 27 *movsi_1/1 [length = 6]
#APP
# 129 "libavcodec/i386/fft_sse.c" 1
movaps (%edx), %xmm0
movaps %xmm4, %xmm2
movaps 16+(%edx), %xmm1
movaps %xmm5, %xmm3
mulps %xmm0, %xmm2
mulps %xmm1, %xmm3
mulps %xmm1, %xmm4
....
subps %xmm1, %xmm2
movaps %xmm0, %xmm1
shufps $0x88, %xmm2, %xmm0
shufps $0xdd, %xmm2, %xmm1
# 0 "" 2
.loc 1 115 0
# 115 "libavcodec/i386/fft_sse.c" 1
movaps 32(%eax), %xmm2
# 0 "" 2
.loc 1 116 0
# 116 "libavcodec/i386/fft_sse.c" 1
movaps 48(%eax), %xmm3
# 0 "" 2
.loc 1 117 0
#NO_APP
leal root2@GOTOFF, %esi # 14 *lea_1 [length = 6]
leal root2mppm@GOTOFF, %ecx # 15 *lea_1 [length = 6]
#APP
# 117 "libavcodec/i386/fft_sse.c" 1
movaps %xmm2, %xmm4
shufps $0x44, %xmm3, %xmm2
shufps $0xee, %xmm3, %xmm4
movaps %xmm2, %xmm5
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080729/0bdd08fc/attachment.pgp>
More information about the ffmpeg-devel
mailing list