Index: libavcodec/i386/fft_sse.c
===================================================================
--- libavcodec/i386/fft_sse.c	£¨ÐÞ¶©°æ 6032£©
+++ libavcodec/i386/fft_sse.c	£¨¹¤×÷¿½±´£©
@@ -27,6 +27,9 @@
 static const int p1p1m1m1[4] __attribute__((aligned(16))) =
     { 0, 0, 1 << 31, 1 << 31 };
 
+static const int p1m1p1m1[4] __attribute__((aligned(16))) = /* bit 31 set in elements 1 and 3 only; loaded into xmm7 as the xorps mask */
+    { 0, 1 << 31, 0, 1 << 31 }; /* NOTE(review): 1 << 31 shifts into the sign bit of int (same idiom as p1p1m1m1) - confirm this is acceptable */
+
 #if 0
 static void print_v4sf(const char *str, __m128 a)
 {
@@ -118,3 +121,98 @@
     } while (nblocks != 0);
 }
 
+void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,      /* fills output[0 .. n-1], with n = 1 << s->nbits */
+                       const FFTSample *input, FFTSample *tmp) /* tmp is used as the FFTComplex work buffer z */
+{
+    long k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab; /* destination index in z for each pre-rotated value */
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits; /* n2, n4, n8 below are n/2, n/4, n/8 */
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1)); /* xmm7 = mask used by the xorps in the loop below */
+
+    /* pre rotation: in1 is read forwards from input, in2 backwards from input + n2 - 4 */
+    in1 = input;
+    in2 = input + n2 - 4;
+
+    /* Complex multiplication of the input pairs by (tcos, tsin).
+       Two complex products per iteration, we could have 4 with 8 xmm
+       registers, 8 with 16 xmm registers.
+       Maybe we should unroll more.
+     */
+    for (k = 0; k < n4; k += 2) {
+        asm volatile (
+            "movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2
+            "movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1
+            "movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
+            "movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
+            "shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
+            "shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
+            "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
+            "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
+            "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
+            "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
+            "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
+            "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
+            "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
+            ::"m"(in2[-2*k]), "m"(in1[2*k]),
+              "m"(tcos[k]), "m"(tsin[k])
+        );
+        /* Should be in the same block, hack for gcc2.95 & gcc3: this asm stores xmm0 as left by the previous asm, without declaring it as an operand */
+        asm (
+            "movlps %%xmm0, %0 \n\t"
+            "movhps %%xmm0, %1 \n\t"
+            :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]) /* low half goes to z[revtab[k]], high half to z[revtab[k + 1]] */
+        );
+    }
+
+    ff_fft_calc_sse(&s->fft, z);
+
+    /* Reload the mask into xmm7 after the call above. Not currently needed, added for safety */
+    asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1));
+
+    /* post rotation + reordering: each pass multiplies two entries of z by (tcos, tsin) and stores them back at z[k] */
+    for (k = 0; k < n4; k += 2) {
+        asm (
+            "movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z
+            "movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos
+            "movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0
+            "movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin
+            "shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0
+            "shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0
+            "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0
+            "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0
+            "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0
+            "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR
+            "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0
+            "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii
+            "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result
+            "movaps %%xmm0, %0 \n\t"
+            :"+m"(z[k]) /* NOTE(review): movaps moves 16 bytes; the operand names only z[k] - confirm the access to the following entry is covered */
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    z += n8; /* z now points n8 entries into the buffer, so both z[k] and z[-k - 1] are used below */
+    /* XXX: Could be vectorized, but can't do better than the compiler */
+    for(k = 0; k < n8; k++) {
+        output[2*k] = -z[k].im;
+        output[n2 - 1 - 2*k] = z[k].im;
+
+        output[2*k + 1] = z[-k - 1].re;
+        output[n2 - 2 - 2*k] = -z[-k - 1].re;
+
+        output[n2 + 2*k] = -z[k].re;
+        output[n - 1 - 2*k] = -z[k].re;
+
+        output[n2 + 2*k + 1] = z[-k - 1].im;
+        output[n - 2 - 2*k] = z[-k - 1].im;
+    }
+}
Index: libavcodec/dsputil.h
===================================================================
--- libavcodec/dsputil.h	£¨ÐÞ¶©°æ 6032£©
+++ libavcodec/dsputil.h	£¨¹¤×÷¿½±´£©
@@ -655,6 +655,8 @@
                 const FFTSample *input, FFTSample *tmp);
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                         const FFTSample *input, FFTSample *tmp);
+void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp);
 void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                const FFTSample *input, FFTSample *tmp);
 void ff_mdct_end(MDCTContext *s);
Index: libavcodec/fft.c
===================================================================
--- libavcodec/fft.c	£¨ÐÞ¶©°æ 6032£©
+++ libavcodec/fft.c	£¨¹¤×÷¿½±´£©
@@ -65,17 +65,18 @@
 
         if (has_vectors) {
 #if defined(HAVE_MMX)
-            if (has_vectors & MM_3DNOWEXT)
+            if (has_vectors & MM_3DNOWEXT) {
+                /* 3DNowEx for K7/K8 */
                 s->imdct_calc = ff_imdct_calc_3dn2;
-            if (has_vectors & MM_3DNOWEXT)
-                /* 3DNowEx for K7/K8 */
                 s->fft_calc = ff_fft_calc_3dn2;
-            else if (has_vectors & MM_3DNOW)
+            } else if (has_vectors & MM_3DNOW)
                 /* 3DNow! for K6-2/3 */
                 s->fft_calc = ff_fft_calc_3dn;
-            else if (has_vectors & MM_SSE)
+            else if (has_vectors & MM_SSE) {
                 /* SSE for P3/P4 */
                 s->fft_calc = ff_fft_calc_sse;
+                s->imdct_calc = ff_imdct_calc_sse; /* selected together with the SSE FFT in this branch */
+            }
 #else /* HAVE_MMX */
             if (has_vectors & MM_ALTIVEC)
                 s->fft_calc = ff_fft_calc_altivec;