[Ffmpeg-devel] [PATCH] SSE counterpart of ff_imdct_calc_3dn2

Wed Aug 23 10:46:49 CEST 2006

Hi

On Wed, Aug 23, 2006 at 03:32:19PM +0800, Zuxy Meng wrote:
> 2006/8/21, Loren Merritt <lorenm at u.washington.edu>:
> >If you can't make an sse version that's faster than C, have you tried mmx?
> >Just take the one from 3dn2 and change pswapd to pshufw.
> 
> Changing the last loop to SSE or MMX doesn't bring about significant
> speedup. Anyway I have this new patch for your review:-)
> 
> -- 
> Zuxy
> Beauty is truth,
> While truth is beauty.
> PGP KeyID: E8555ED6

[...]
> +    z += n8;
> +    asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1p1m1p1));
> +    for(k = 0; k < n8; k += 2) {
> +        /*
> +           Mnemonics:
> +           0 = z[k].re
> +           1 = z[k].im
> +           2 = z[k + 1].re
> +           3 = z[k + 1].im
> +           4 = z[-k - 2].re
> +           5 = z[-k - 2].im
> +           6 = z[-k - 1].re
> +           7 = z[-k - 1].im
> +        */
> +        asm volatile(
> +            "movaps          %0, %%xmm0 \n\t"   // xmm0 = 0 1 2 3
> +            "movaps          %1, %%xmm1 \n\t"   // xmm1 = 4 5 6 7
> +            ::"m"(z[k]), "m"(z[-2 - k])
> +        );
> +        asm (
> +            "movaps      %%xmm0, %%xmm2 \n\t"   // xmm2 = 0 1 2 3
> +            "shufps $141,%%xmm1, %%xmm0 \n\t"   // xmm0 = 1 3 4 6
> +            "shufps $216,%%xmm1, %%xmm2 \n\t"   // xmm2 = 0 2 5 7
> +            "shufps $156,%%xmm0, %%xmm0 \n\t"   // xmm0 = 1 6 3 4
> +            "shufps $156,%%xmm2, %%xmm2 \n\t"   // xmm2 = 0 7 2 5
> +            "movaps      %%xmm0, %%xmm1 \n\t"   // xmm1 = 1 6 3 4
> +            "xorps       %%xmm7, %%xmm2 \n\t"   // xmm2 = -0 7 -2 5 !
> +            "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 = -1 6 -3 4 !
> +            "shufps $27, %%xmm1, %%xmm1 \n\t"   // xmm1 = 4 3 6 1
> +            "movaps      %%xmm2, %%xmm3 \n\t"   // xmm3 = -0 7 -2 5
> +            "xorps       %%xmm7, %%xmm1 \n\t"   // xmm1 = -4 3 -6 1 !
> +            "shufps $27, %%xmm3, %%xmm3 \n\t"   // xmm3 = 5 -2 7 -0 !
> +            "movaps      %%xmm0, %0    \n\t"
> +            "movaps      %%xmm1, %1    \n\t"
> +            "movaps      %%xmm2, %2    \n\t"
> +            "movaps      %%xmm3, %3    \n\t"
> +            :"=m"(output[2*k]), "=m"(output[n2 - 4 - 2*k]),
> +             "=m"(output[n2 + 2*k]), "=m"(output[n - 4 - 2*k])
> +        );
> +    }

I think that can be done faster with:

            "movaps          %0, %%xmm0 \n\t"   // xmm0 = 0 1 2 3
            "movaps          %1, %%xmm1 \n\t"   // xmm1 = 4 5 6 7
            ::"m"(z[k]), "m"(z[-2 - k])
        );
        asm (
            "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 =-0-1-2-3
            "movaps      %%xmm0, %%xmm2 \n\t"   // xmm2 =-0-1-2-3
            "shufps $141,%%xmm1, %%xmm0 \n\t"   // xmm0 =-1-3 4 6
            "shufps $216,%%xmm1, %%xmm2 \n\t"   // xmm2 =-0-2 5 7
            "shufps $156,%%xmm0, %%xmm0 \n\t"   // xmm0 =-1 6-3 4 !
            "shufps $156,%%xmm2, %%xmm2 \n\t"   // xmm2 =-0 7-2 5 !
            "movaps      %%xmm0, %0    \n\t"
            "movaps      %%xmm2, %2    \n\t"
            "shufps $27, %%xmm0, %%xmm0 \n\t"   // xmm0 = 4 -3 6 -1
            "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 = -4 3 -6 1 !
            "shufps $27, %%xmm2, %%xmm2 \n\t"   // xmm2 = 5 -2 7 -0 !
            "movaps      %%xmm0, %1    \n\t"
            "movaps      %%xmm2, %3    \n\t"

or

            "movlps          %0, %%xmm0 \n\t"   // xmm0 = 0 1 X X 
            "movlps        8+%0, %%xmm1 \n\t"   // xmm1 = 2 3 X X
            "movhps        8+%1, %%xmm0 \n\t"   // xmm0 = 0 1 6 7
            "movhps          %1, %%xmm1 \n\t"   // xmm1 = 2 3 4 5
            ::"m"(z[k]), "m"(z[-2 - k])
        );
        asm (
            "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 =-0-1 6 7
            "xorps       %%xmm7, %%xmm1 \n\t"   // xmm1 =-2-3 4 5
            "movaps      %%xmm0, %%xmm2 \n\t"   // xmm2 =-0-1 6 7
            "shufps  $??,%%xmm1, %%xmm0 \n\t"   // xmm0 =-1 6-3 4 !
            "shufps  $??,%%xmm1, %%xmm2 \n\t"   // xmm2 =-0 7-2 5 !
            "movaps      %%xmm0, %0    \n\t"
            "movaps      %%xmm2, %2    \n\t"
            "shufps $27, %%xmm0, %%xmm0 \n\t"   // xmm0 = 4 -3 6 -1
            "xorps       %%xmm6, %%xmm0 \n\t"   // xmm0 = -4 3 -6 1 !
            "shufps $27, %%xmm2, %%xmm2 \n\t"   // xmm2 = 5 -2 7 -0 !
            "movaps      %%xmm0, %1    \n\t"
            "movaps      %%xmm2, %3    \n\t"

Both are untested, so maybe I've missed something silly.

And of course the two asm blocks should be merged, and the whole loop
should be done in asm.

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In the past you could go to a library and read, borrow or copy any book
Today you'd get arrested for mere telling someone where the library is