[Ffmpeg-devel] [PATCH] SSE counterpart of ff_imdct_calc_3dn2
Michael Niedermayer
michaelni
Wed Aug 23 10:46:49 CEST 2006
Hi
On Wed, Aug 23, 2006 at 03:32:19PM +0800, Zuxy Meng wrote:
> 2006/8/21, Loren Merritt <lorenm at u.washington.edu>:
> >If you can't make an sse version that's faster than C, have you tried mmx?
> >Just take the one from 3dn2 and change pswapd to pshufw.
>
> Changing the last loop to SSE or MMX doesn't bring about significant
> speedup. Anyway I have this new patch for your review:-)
>
> --
> Zuxy
> Beauty is truth,
> While truth is beauty.
> PGP KeyID: E8555ED6
[...]
> + z += n8;
> + asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1p1m1p1));
> + for(k = 0; k < n8; k += 2) {
> + /*
> + Mnemonics:
> + 0 = z[k].re
> + 1 = z[k].im
> + 2 = z[k + 1].re
> + 3 = z[k + 1].im
> + 4 = z[-k - 2].re
> + 5 = z[-k - 2].im
> + 6 = z[-k - 1].re
> + 7 = z[-k - 1].im
> + */
> + asm volatile(
> + "movaps %0, %%xmm0 \n\t" // xmm0 = 0 1 2 3
> + "movaps %1, %%xmm1 \n\t" // xmm1 = 4 5 6 7
> + ::"m"(z[k]), "m"(z[-2 - k])
> + );
> + asm (
> + "movaps %%xmm0, %%xmm2 \n\t" // xmm2 = 0 1 2 3
> + "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = 1 3 4 6
> + "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = 0 2 5 7
> + "shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 = 1 6 3 4
> + "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = 0 7 2 5
> + "movaps %%xmm0, %%xmm1 \n\t" // xmm1 = 1 6 3 4
> + "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -0 7 -2 5 !
> + "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -1 6 -3 4 !
> + "shufps $27, %%xmm1, %%xmm1 \n\t" // xmm1 = 4 3 6 1
> + "movaps %%xmm2, %%xmm3 \n\t" // xmm3 = -0 7 -2 5
> + "xorps %%xmm7, %%xmm1 \n\t" // xmm1 = -4 3 -6 1 !
> + "shufps $27, %%xmm3, %%xmm3 \n\t" // xmm3 = 5 -2 7 -0 !
> + "movaps %%xmm0, %0 \n\t"
> + "movaps %%xmm1, %1 \n\t"
> + "movaps %%xmm2, %2 \n\t"
> + "movaps %%xmm3, %3 \n\t"
> + :"=m"(output[2*k]), "=m"(output[n2 - 4 - 2*k]),
> + "=m"(output[n2 + 2*k]), "=m"(output[n - 4 - 2*k])
> + );
> + }
I think that can be done faster with:
"movaps %0, %%xmm0 \n\t" // xmm0 = 0 1 2 3
"movaps %1, %%xmm1 \n\t" // xmm1 = 4 5 6 7
::"m"(z[k]), "m"(z[-2 - k])
);
asm (
"xorps %%xmm7, %%xmm0 \n\t" // xmm0 =-0-1-2-3
"movaps %%xmm0, %%xmm2 \n\t" // xmm2 =-0-1-2-3
"shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 =-1-3 4 6
"shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 =-0-2 5 7
"shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 =-1 6-3 4 !
"shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 =-0 7-2 5 !
"movaps %%xmm0, %0 \n\t"
"movaps %%xmm2, %2 \n\t"
"shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1
"xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 !
"shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 !
"movaps %%xmm0, %1 \n\t"
"movaps %%xmm2, %3 \n\t"
or
"movlps %0, %%xmm0 \n\t" // xmm0 = 0 1 X X
"movlps 8+%0, %%xmm1 \n\t" // xmm1 = 2 3 X X
"movhps 8+%1, %%xmm0 \n\t" // xmm0 = 0 1 6 7
"movhps %1, %%xmm1 \n\t" // xmm1 = 2 3 4 5
::"m"(z[k]), "m"(z[-2 - k])
);
asm (
"xorps %%xmm7, %%xmm0 \n\t" // xmm0 =-0-1 6 7
"xorps %%xmm7, %%xmm1 \n\t" // xmm1 =-2-3 4 5
"movaps %%xmm0, %%xmm2 \n\t" // xmm2 =-0-1 6 7
"shufps $??,%%xmm1, %%xmm0 \n\t" // xmm0 =-1 6-3 4 !
"shufps $??,%%xmm1, %%xmm2 \n\t" // xmm2 =-0 7-2 5 !
"movaps %%xmm0, %0 \n\t"
"movaps %%xmm2, %2 \n\t"
"shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1
"xorps %%xmm6, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 !
"shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 !
"movaps %%xmm0, %1 \n\t"
"movaps %%xmm2, %3 \n\t"
Both are untested, so maybe I've missed something silly.
And of course the two asm blocks should be merged, and the whole loop done in
asm.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In the past you could go to a library and read, borrow or copy any book
Today you'd get arrested for mere telling someone where the library is
More information about the ffmpeg-devel
mailing list