[FFmpeg-devel] [PATCH] split-radix FFT

Loren Merritt lorenm
Sat Jul 26 04:14:00 CEST 2008


$subject, vaguely based on djbfft.
Changed from djb:
* added SIMD.
* removed the hand-scheduled Pentium Pro code. gcc's output from simple C 
is better on all CPUs I have access to.
* removed the distinction between fft and ifft. they're just permutations 
of each other, so the difference belongs in revtab[] and not in the code 
(see the sketch after this list).
* removed the distinction between pass() and pass_big(). C can always use 
the memory-efficient version, and SIMD never does because the shuffles are 
too costly.
* made an entirely different pass_big(), to avoid store->load aliasing.
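
To illustrate the fft/ifft point: the unnormalized inverse DFT is just the 
forward DFT with its output index negated mod n, so the same butterflies 
serve both directions once that reindexing is folded into the input 
permutation. A standalone numeric check (mine, not part of the patch):

/* check that unnormalized idft(x)[k] == dft(x)[(n-k)%n] */
#include <complex.h>
#include <math.h>
#include <stdio.h>

#define N 16

static void dft(double complex *out, const double complex *in, double sign)
{
    int j, k;
    for (k = 0; k < N; k++) {
        out[k] = 0;
        for (j = 0; j < N; j++)
            out[k] += in[j] * cexp(sign * 2 * M_PI * I * j * k / N);
    }
}

int main(void)
{
    double complex x[N], f[N], g[N];
    double err = 0;
    int i, k;
    for (i = 0; i < N; i++)
        x[i] = cos(i) + I * sin(3 * i);
    dft(f, x, -1.0); /* forward */
    dft(g, x, +1.0); /* inverse, minus the 1/N scale */
    for (k = 0; k < N; k++)
        err = fmax(err, cabs(g[k] - f[(N - k) % N]));
    printf("max err: %g\n", err); /* ~1e-13 */
    return 0;
}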

I tried the tangent FFT, but I couldn't make it faster than split-radix. 
Tangent has asymptotically ~5% fewer arithmetic ops, but only 1-2% fewer 
at sizes typical of audio codecs, and even a couple of extra shuffles or 
other overhead is enough to cancel that.
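
Rough op counts (my arithmetic, not from the patch): split-radix takes 
4*n*lg(n) - 6*n + 8 real ops, while the tangent FFT is asymptotically 
(34/9)*n*lg(n) + O(n), i.e. 34/36 =~ 5.6% fewer. The O(n) terms eat most 
of that at small n; e.g. at n=256 the totals come to about 6664 vs 6552, 
under 2%.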

I tried an in-place fft_permute, but it wasn't any faster than 
out-of-place + memcpy, and was quite a bit more complex.
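
For reference, the out-of-place permute is nothing more than a scatter 
through revtab[] into a scratch buffer plus a memcpy back; this sketch just 
restates what ff_fft_permute_c in the patch does:

static void permute(FFTContext *s, FFTComplex *z)
{
    int j, np = 1 << s->nbits;
    const uint16_t *revtab = s->revtab;
    for (j = 0; j < np; j++)
        s->tmp_buf[revtab[j]] = z[j];
    memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
}

An in-place version has to follow the cycles of revtab[] instead, swapping 
elements as it goes.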


benchmarks (cycles):

2^4  2^5  2^6   2^7  2^8   2^9    2^10  2^11   2^12    2^13    2^14     2^15     2^16 fft size

368 1204 2925  6958 15864 35742  79907 176930 391877  954522 2582623  7027105 15979384 k8.svn.c
201  641 1666  4168  9730 22360  49814 111063 245835  551301 2140560  5449270 14161100 k8.djb.c
194  627 1656  4169  9923 22881  51998 116276 257871  577594 1983282  4926132 12508115 k8.split.c
148  526 1546  3537  7757 17209  38176  85052 189656  461192 1180269  2665751  6856414 k8.svn.sse
122  459 1380  3111  6687 14719  32318  71412 165771  418068 1112945  2575784  6684035 k8.svn.3dn2
173  478 1208  2832  6397 14097  30816  66861 145040  327352 1163446  2907148  7465755 k8.split.3dn2
114  363  934  2206  5190 11527  25515  55826 122329  272338  821973  1974221  4931610 k8.split.sse

298  870 2196  4998 11462 25954  58196 129402 290652  652258 1515451  3379696  7337748 conroe.svn.c
138  396 1016  2453  5703 13080  29542  72980 166937  382352  852779  1965809  4289844 conroe.djb.c
132  383  959  2443  5691 12957  29167  64625 141885  310046  670222  1449479  3131753 conroe.split.c
114  390  926  1978  4112  8825  19316  42252  94760  214178  462689   994594  2200585 conroe.svn.sse
  82  216  498  1221  2630  5715  12265  26415  58853  126274  269532   568314  1220458 conroe.split.sse

318  932 2430  5578 12857 29093  65321 145709 327347  745524 1763483  3977390  8756696 penryn.svn.c
129  420 1146  2824  6670 15907  35248  87700 199251  454614 1015788  2309352  5068660 penryn.djb.c
151  424 1094  2746  6369 14604  32945  73085 161065  354042  768205  1672258  3601885 penryn.split.c
  72  335  834  1776  3626  7733  17273  38134  88629  204577  449889   982217  2171035 penryn.svn.sse
  36  144  418  1054  2343  5127  11289  24670  56975  124178  268514   578586  1262736 penryn.split.sse

729 1869 4909 12412 28876 65702 147951 337286 786350 1770779 3988037 10739966 42763027 prescott.svn.c
463 1224 3179  7644 18052 41395  91632 206204 450451  985720 2173172  4685558 10307042 prescott.djb.c
434 1155 2894  6963 16356 37468  87928 202304 457022 1012743 2257329  5002317 10999607 prescott.split.c
222  522 1208  2909  6858 15517  34318  83046 193854  465508 1221169  5221734 17064482 prescott.svn.sse
124  370  850  2003  4377 10050  22798  51432 113311  254526  590618  1548052  3930814 prescott.split.sse

--Loren Merritt
---
 libavcodec/dsputil.h       |   24 +++-
 libavcodec/fft.c           |  351 +++++++++++++++++++++-------------
 libavcodec/fft_template.h  |   53 +++++
 libavcodec/i386/fft_3dn.c  |  111 +-----------
 libavcodec/i386/fft_3dn2.c |  283 +++++++++++++++++++---------
 libavcodec/i386/fft_sse.c  |  452 ++++++++++++++++++++++++++++++++++----------
 6 files changed, 840 insertions(+), 434 deletions(-)
 create mode 100644 libavcodec/fft_template.h

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 7a47b87..cf1a467 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -638,6 +638,8 @@ typedef struct FFTContext {
     uint16_t *revtab;
     FFTComplex *exptab;
     FFTComplex *exptab1; /* only used by SSE code */
+    FFTComplex *tmp_buf;
+    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
     void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
     void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp);
@@ -646,13 +648,18 @@ typedef struct FFTContext {
 } FFTContext;
 
 int ff_fft_init(FFTContext *s, int nbits, int inverse);
-void ff_fft_permute(FFTContext *s, FFTComplex *z);
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 
+static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
+{
+    s->fft_permute(s, z);
+}
 static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
 {
     s->fft_calc(s, z);
@@ -670,6 +677,21 @@ typedef struct MDCTContext {
     FFTContext fft;
 } MDCTContext;
 
+/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
+extern FFTSample ff_cos_16[8];
+extern FFTSample ff_cos_32[16];
+extern FFTSample ff_cos_64[32];
+extern FFTSample ff_cos_128[64];
+extern FFTSample ff_cos_256[128];
+extern FFTSample ff_cos_512[256];
+extern FFTSample ff_cos_1024[512];
+extern FFTSample ff_cos_2048[1024];
+extern FFTSample ff_cos_4096[2048];
+extern FFTSample ff_cos_8192[4096];
+extern FFTSample ff_cos_16384[8192];
+extern FFTSample ff_cos_32768[16384];
+extern FFTSample ff_cos_65536[32768];
+
 /**
  * Generate a Kaiser-Bessel Derived Window.
  * @param   window  pointer to half window
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 47e9e06..2c1bb8c 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -1,6 +1,8 @@
 /*
  * FFT/IFFT transforms
+ * Copyright (c) 2008 Loren Merritt
  * Copyright (c) 2002 Fabrice Bellard.
+ * Partly based on libdjbfft, Copyright (c) 1999 D. J. Bernstein
  *
  * This file is part of FFmpeg.
  *
@@ -26,6 +28,38 @@
 
 #include "dsputil.h"
 
+DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]);
+static FFTSample *ff_cos_tabs[] = {
+    ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024,
+    ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536,
+};
+
+static int split_radix_permutation(int i, int n, int inverse)
+{
+    int m;
+    if(n <= 2) return i;
+    m = n >> 1;
+    if(i < m) return split_radix_permutation(i,m,inverse) << 1;
+    i -= m;
+    m >>= 1;
+    if(!inverse) i ^= m;
+    if(i < m) return (split_radix_permutation(i,m,inverse) << 2) + 1;
+    i -= m;
+    return ((split_radix_permutation(i,m,inverse) << 2) - 1) & (n - 1);
+}
+
 /**
  * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
  * done
@@ -34,12 +68,15 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
     int i, j, m, n;
     float alpha, c1, s1, s2;
-    int shuffle = 0;
+    int split_radix = 1;
     int av_unused has_vectors;
 
+    if (nbits < 2 || nbits > 16)
+        goto fail;
     s->nbits = nbits;
     n = 1 << nbits;
 
+    s->tmp_buf = NULL;
     s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
     if (!s->exptab)
         goto fail;
@@ -50,13 +87,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
     s2 = inverse ? 1.0 : -1.0;
 
-    for(i=0;i<(n/2);i++) {
-        alpha = 2 * M_PI * (float)i / (float)n;
-        c1 = cos(alpha);
-        s1 = sin(alpha) * s2;
-        s->exptab[i].re = c1;
-        s->exptab[i].im = s1;
-    }
+    s->fft_permute = ff_fft_permute_c;
     s->fft_calc = ff_fft_calc_c;
     s->imdct_calc = ff_imdct_calc;
     s->imdct_half = ff_imdct_half;
@@ -64,36 +95,55 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
 
 #ifdef HAVE_MMX
     has_vectors = mm_support();
-    shuffle = 1;
-    if (has_vectors & MM_3DNOWEXT) {
-        /* 3DNowEx for K7/K8 */
+    if (has_vectors & MM_SSE) {
+        /* SSE for P3/P4/K8 */
+        s->imdct_calc = ff_imdct_calc_sse;
+        s->imdct_half = ff_imdct_half_sse;
+        s->fft_permute = ff_fft_permute_sse;
+        s->fft_calc = ff_fft_calc_sse;
+    } else if (has_vectors & MM_3DNOWEXT) {
+        /* 3DNowEx for K7 */
         s->imdct_calc = ff_imdct_calc_3dn2;
         s->imdct_half = ff_imdct_half_3dn2;
         s->fft_calc = ff_fft_calc_3dn2;
     } else if (has_vectors & MM_3DNOW) {
         /* 3DNow! for K6-2/3 */
         s->fft_calc = ff_fft_calc_3dn;
-    } else if (has_vectors & MM_SSE) {
-        /* SSE for P3/P4 */
-        s->imdct_calc = ff_imdct_calc_sse;
-        s->imdct_half = ff_imdct_half_sse;
-        s->fft_calc = ff_fft_calc_sse;
-    } else {
-        shuffle = 0;
     }
 #elif defined HAVE_ALTIVEC && !defined ALTIVEC_USE_REFERENCE_C_CODE
     has_vectors = mm_support();
     if (has_vectors & MM_ALTIVEC) {
         s->fft_calc = ff_fft_calc_altivec;
-        shuffle = 1;
+        split_radix = 0;
     }
 #endif
 
     /* compute constant table for HAVE_SSE version */
-    if (shuffle) {
+    if (split_radix) {
+        for(j=4; j<=nbits; j++) {
+            int m = 1<<j;
+            double freq = 2*M_PI/m;
+            FFTSample *tab = ff_cos_tabs[j-4];
+            for(i=0; i<=m/4; i++)
+                tab[i] = cos(i*freq);
+            for(i=1; i<m/4; i++)
+                tab[m/2-i] = tab[i];
+        }
+        for(i=0; i<n; i++)
+            s->revtab[(n - split_radix_permutation(i, n, s->inverse)) % n] = i;
+        s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
+    } else {
         int np, nblocks, np2, l;
         FFTComplex *q;
 
+        for(i=0; i<(n/2); i++) {
+            alpha = 2 * M_PI * (float)i / (float)n;
+            c1 = cos(alpha);
+            s1 = sin(alpha) * s2;
+            s->exptab[i].re = c1;
+            s->exptab[i].im = s1;
+        }
+
         np = 1 << nbits;
         nblocks = np >> 3;
         np2 = np >> 1;
@@ -116,137 +166,43 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
             nblocks = nblocks >> 1;
         } while (nblocks != 0);
         av_freep(&s->exptab);
-    }
 
-    /* compute bit reverse table */
-
-    for(i=0;i<n;i++) {
-        m=0;
-        for(j=0;j<nbits;j++) {
-            m |= ((i >> j) & 1) << (nbits-j-1);
+        for(i=0;i<n;i++) {
+            m=0;
+            for(j=0;j<nbits;j++) {
+                m |= ((i >> j) & 1) << (nbits-j-1);
+            }
+            s->revtab[i]=m;
         }
-        s->revtab[i]=m;
     }
+
     return 0;
  fail:
     av_freep(&s->revtab);
     av_freep(&s->exptab);
     av_freep(&s->exptab1);
+    av_freep(&s->tmp_buf);
     return -1;
 }
 
-/* butter fly op */
-#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
-{\
-  FFTSample ax, ay, bx, by;\
-  bx=pre1;\
-  by=pim1;\
-  ax=qre1;\
-  ay=qim1;\
-  pre = (bx + ax);\
-  pim = (by + ay);\
-  qre = (bx - ax);\
-  qim = (by - ay);\
-}
-
-#define MUL16(a,b) ((a) * (b))
-
-#define CMUL(pre, pim, are, aim, bre, bim) \
-{\
-   pre = (MUL16(are, bre) - MUL16(aim, bim));\
-   pim = (MUL16(are, bim) + MUL16(bre, aim));\
-}
-
-/**
- * Do a complex FFT with the parameters defined in ff_fft_init(). The
- * input data must be permuted before with s->revtab table. No
- * 1.0/sqrt(n) normalization is done.
- */
-void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
-{
-    int ln = s->nbits;
-    int j, np, np2;
-    int nblocks, nloops;
-    register FFTComplex *p, *q;
-    FFTComplex *exptab = s->exptab;
-    int l;
-    FFTSample tmp_re, tmp_im;
-
-    np = 1 << ln;
-
-    /* pass 0 */
-
-    p=&z[0];
-    j=(np >> 1);
-    do {
-        BF(p[0].re, p[0].im, p[1].re, p[1].im,
-           p[0].re, p[0].im, p[1].re, p[1].im);
-        p+=2;
-    } while (--j != 0);
-
-    /* pass 1 */
-
-
-    p=&z[0];
-    j=np >> 2;
-    if (s->inverse) {
-        do {
-            BF(p[0].re, p[0].im, p[2].re, p[2].im,
-               p[0].re, p[0].im, p[2].re, p[2].im);
-            BF(p[1].re, p[1].im, p[3].re, p[3].im,
-               p[1].re, p[1].im, -p[3].im, p[3].re);
-            p+=4;
-        } while (--j != 0);
-    } else {
-        do {
-            BF(p[0].re, p[0].im, p[2].re, p[2].im,
-               p[0].re, p[0].im, p[2].re, p[2].im);
-            BF(p[1].re, p[1].im, p[3].re, p[3].im,
-               p[1].re, p[1].im, p[3].im, -p[3].re);
-            p+=4;
-        } while (--j != 0);
-    }
-    /* pass 2 .. ln-1 */
-
-    nblocks = np >> 3;
-    nloops = 1 << 2;
-    np2 = np >> 1;
-    do {
-        p = z;
-        q = z + nloops;
-        for (j = 0; j < nblocks; ++j) {
-            BF(p->re, p->im, q->re, q->im,
-               p->re, p->im, q->re, q->im);
-
-            p++;
-            q++;
-            for(l = nblocks; l < np2; l += nblocks) {
-                CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
-                BF(p->re, p->im, q->re, q->im,
-                   p->re, p->im, tmp_re, tmp_im);
-                p++;
-                q++;
-            }
-
-            p += nloops;
-            q += nloops;
-        }
-        nblocks = nblocks >> 1;
-        nloops = nloops << 1;
-    } while (nblocks != 0);
-}
-
 /**
  * Do the permutation needed BEFORE calling ff_fft_calc()
  */
-void ff_fft_permute(FFTContext *s, FFTComplex *z)
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
 {
     int j, k, np;
     FFTComplex tmp;
     const uint16_t *revtab = s->revtab;
+    np = 1 << s->nbits;
+
+    if (s->tmp_buf) {
+        /* TODO: handle split-radix permute in a more optimal way, probably in-place */
+        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+        memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
+        return;
+    }
 
     /* reverse */
-    np = 1 << s->nbits;
     for(j=0;j<np;j++) {
         k = revtab[j];
         if (k < j) {
@@ -262,5 +218,138 @@ void ff_fft_end(FFTContext *s)
     av_freep(&s->revtab);
     av_freep(&s->exptab);
     av_freep(&s->exptab1);
+    av_freep(&s->tmp_buf);
+}
+
+#define sqrthalf (float)M_SQRT1_2
+
+#define BF(x,y,a,b) {\
+    x = a - b;\
+    y = a + b;\
+}
+
+#define BUTTERFLIES(a0,a1,a2,a3) {\
+    BF(t3, t5, t5, t1);\
+    BF(a2.re, a0.re, a0.re, t5);\
+    BF(a3.im, a1.im, a1.im, t3);\
+    BF(t4, t6, t2, t6);\
+    BF(a3.re, a1.re, a1.re, t4);\
+    BF(a2.im, a0.im, a0.im, t6);\
+}
+
+// force loading all the inputs before storing any.
+// this is slightly slower for small data, but avoids store->load aliasing
+// for addresses separated by large powers of 2.
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
+    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
+    BF(t3, t5, t5, t1);\
+    BF(a2.re, a0.re, r0, t5);\
+    BF(a3.im, a1.im, i1, t3);\
+    BF(t4, t6, t2, t6);\
+    BF(a3.re, a1.re, r1, t4);\
+    BF(a2.im, a0.im, i0, t6);\
+}
+
+#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
+    t1 = a2.re * wre + a2.im * wim;\
+    t2 = a2.im * wre - a2.re * wim;\
+    t5 = a3.re * wre - a3.im * wim;\
+    t6 = a3.im * wre + a3.re * wim;\
+    BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
+    t1 = a2.re;\
+    t2 = a2.im;\
+    t5 = a3.re;\
+    t6 = a3.im;\
+    BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+static void fft4(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+    BF(t3, t1, z[0].re, z[1].re);
+    BF(t8, t6, z[3].re, z[2].re);
+    BF(z[2].re, z[0].re, t1, t6);
+    BF(t4, t2, z[0].im, z[1].im);
+    BF(t7, t5, z[2].im, z[3].im);
+    BF(z[3].im, z[1].im, t4, t8);
+    BF(z[3].re, z[1].re, t3, t7);
+    BF(z[2].im, z[0].im, t2, t5);
+}
+
+static void fft8(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+    fft4(z);
+
+    BF(t1, z[5].re, z[4].re, -z[5].re);
+    BF(t2, z[5].im, z[4].im, -z[5].im);
+    BF(t3, z[7].re, z[6].re, -z[7].re);
+    BF(t4, z[7].im, z[6].im, -z[7].im);
+    BF(t8, t1, t3, t1);
+    BF(t7, t2, t2, t4);
+    BF(z[4].re, z[0].re, z[0].re, t1);
+    BF(z[4].im, z[0].im, z[0].im, t2);
+    BF(z[6].re, z[2].re, z[2].re, t7);
+    BF(z[6].im, z[2].im, z[2].im, t8);
+
+    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
+}
+
+static void fft16(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6;
+
+    fft8(z);
+    fft4(z+8);
+    fft4(z+12);
+
+    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
+    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
+    TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]);
+    TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]);
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+#define PASS(name)\
+static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
+{\
+    FFTSample t1, t2, t3, t4, t5, t6;\
+    int o1 = 2*n;\
+    int o2 = 4*n;\
+    int o3 = 6*n;\
+    const FFTSample *wim = wre+o1;\
+    n--;\
+\
+    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
+    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+    do {\
+        z += 2;\
+        wre += 2;\
+        wim -= 2;\
+        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
+        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+    } while(--n);\
+}
+
+PASS(pass)
+#undef BUTTERFLIES
+#define BUTTERFLIES BUTTERFLIES_BIG
+PASS(pass_big)
+
+#include "fft_template.h"
+
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init(). The
+ * input data must be permuted before with s->revtab table. No
+ * 1.0/sqrt(n) normalization is done.
+ */
+void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
+{
+    fft_dispatch[s->nbits-2](z);
 }
 
diff --git a/libavcodec/fft_template.h b/libavcodec/fft_template.h
new file mode 100644
index 0000000..a4c54b0
--- /dev/null
+++ b/libavcodec/fft_template.h
@@ -0,0 +1,53 @@
+/**
+ * @file fft_template.h
+ * FFT/IFFT transforms
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define DECL_FFT(n,n2,n4,suffix)\
+static void fft##n##suffix(FFTComplex *z)\
+{\
+    fft##n2(z);\
+    fft##n4(z+n4*2);\
+    fft##n4(z+n4*3);\
+    pass##suffix(z,ff_cos_##n,n4/2);\
+}
+
+DECL_FFT(32,16,8,)
+DECL_FFT(64,32,16,)
+DECL_FFT(128,64,32,)
+DECL_FFT(256,128,64,)
+DECL_FFT(512,256,128,)
+#ifdef BUTTERFLIES_BIG
+#define pass pass_big
+#endif
+DECL_FFT(1024,512,256,)
+DECL_FFT(2048,1024,512,)
+DECL_FFT(4096,2048,1024,)
+DECL_FFT(8192,4096,2048,)
+DECL_FFT(16384,8192,4096,)
+DECL_FFT(32768,16384,8192,)
+DECL_FFT(65536,32768,16384,)
+#undef pass
+
+static void (*fft_dispatch[])(FFTComplex*) = {
+    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
+    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
+};
+
diff --git a/libavcodec/i386/fft_3dn.c b/libavcodec/i386/fft_3dn.c
index 8bd7b89..6f2e2e8 100644
--- a/libavcodec/i386/fft_3dn.c
+++ b/libavcodec/i386/fft_3dn.c
@@ -1,7 +1,6 @@
 /*
  * FFT/MDCT transform with 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
- * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2008 Loren Merritt
  *
  * This file is part of FFmpeg.
  *
@@ -20,109 +19,5 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-
-static const int p1m1[2] __attribute__((aligned(8))) =
-    { 0, 1 << 31 };
-
-static const int m1p1[2] __attribute__((aligned(8))) =
-    { 1 << 31, 0 };
-
-void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
-{
-    int ln = s->nbits;
-    long j;
-    x86_reg i;
-    long nblocks, nloops;
-    FFTComplex *p, *cptr;
-
-    asm volatile(
-        /* FEMMS is not a must here but recommended by AMD */
-        "femms \n\t"
-        "movq %0, %%mm7 \n\t"
-        ::"m"(*(s->inverse ? m1p1 : p1m1))
-    );
-
-    i = 8 << ln;
-    asm volatile(
-        "1: \n\t"
-        "sub $32, %0 \n\t"
-        "movq    (%0,%1), %%mm0 \n\t"
-        "movq  16(%0,%1), %%mm1 \n\t"
-        "movq   8(%0,%1), %%mm2 \n\t"
-        "movq  24(%0,%1), %%mm3 \n\t"
-        "movq      %%mm0, %%mm4 \n\t"
-        "movq      %%mm1, %%mm5 \n\t"
-        "pfadd     %%mm2, %%mm0 \n\t"
-        "pfadd     %%mm3, %%mm1 \n\t"
-        "pfsub     %%mm2, %%mm4 \n\t"
-        "pfsub     %%mm3, %%mm5 \n\t"
-        "movq      %%mm0, %%mm2 \n\t"
-        "punpckldq %%mm5, %%mm6 \n\t"
-        "punpckhdq %%mm6, %%mm5 \n\t"
-        "movq      %%mm4, %%mm3 \n\t"
-        "pxor      %%mm7, %%mm5 \n\t"
-        "pfadd     %%mm1, %%mm0 \n\t"
-        "pfadd     %%mm5, %%mm4 \n\t"
-        "pfsub     %%mm1, %%mm2 \n\t"
-        "pfsub     %%mm5, %%mm3 \n\t"
-        "movq      %%mm0,   (%0,%1) \n\t"
-        "movq      %%mm4,  8(%0,%1) \n\t"
-        "movq      %%mm2, 16(%0,%1) \n\t"
-        "movq      %%mm3, 24(%0,%1) \n\t"
-        "jg 1b \n\t"
-        :"+r"(i)
-        :"r"(z)
-    );
-    /* pass 2 .. ln-1 */
-
-    nblocks = 1 << (ln-3);
-    nloops = 1 << 2;
-    cptr = s->exptab1;
-    do {
-        p = z;
-        j = nblocks;
-        do {
-            i = nloops*8;
-            asm volatile(
-                "1: \n\t"
-                "sub $16, %0 \n\t"
-                "movq    (%1,%0), %%mm0 \n\t"
-                "movq   8(%1,%0), %%mm1 \n\t"
-                "movq    (%2,%0), %%mm2 \n\t"
-                "movq   8(%2,%0), %%mm3 \n\t"
-                "movq      %%mm2, %%mm4 \n\t"
-                "movq      %%mm3, %%mm5 \n\t"
-                "punpckldq %%mm2, %%mm2 \n\t"
-                "punpckldq %%mm3, %%mm3 \n\t"
-                "punpckhdq %%mm4, %%mm4 \n\t"
-                "punpckhdq %%mm5, %%mm5 \n\t"
-                "pfmul   (%3,%0,2), %%mm2 \n\t" //  cre*re cim*re
-                "pfmul  8(%3,%0,2), %%mm3 \n\t"
-                "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
-                "pfmul 24(%3,%0,2), %%mm5 \n\t"
-                "pfadd     %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
-                "pfadd     %%mm3, %%mm5 \n\t"
-                "movq      %%mm0, %%mm2 \n\t"
-                "movq      %%mm1, %%mm3 \n\t"
-                "pfadd     %%mm4, %%mm0 \n\t"
-                "pfadd     %%mm5, %%mm1 \n\t"
-                "pfsub     %%mm4, %%mm2 \n\t"
-                "pfsub     %%mm5, %%mm3 \n\t"
-                "movq      %%mm0,  (%1,%0) \n\t"
-                "movq      %%mm1, 8(%1,%0) \n\t"
-                "movq      %%mm2,  (%2,%0) \n\t"
-                "movq      %%mm3, 8(%2,%0) \n\t"
-                "jg 1b \n\t"
-                :"+r"(i)
-                :"r"(p), "r"(p + nloops), "r"(cptr)
-            );
-            p += nloops*2;
-        } while (--j);
-        cptr += nloops*2;
-        nblocks >>= 1;
-        nloops <<= 1;
-    } while (nblocks != 0);
-    asm volatile("femms");
-}
+#define EMULATE_3DNOWEXT
+#include "fft_3dn2.c"
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index 9068dff..42ac691 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -1,7 +1,6 @@
 /*
  * FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
- * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
  *
  * This file is part of FFmpeg.
  *
@@ -23,105 +22,209 @@
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
 
-static const int p1m1[2] __attribute__((aligned(8))) =
-    { 0, 1 << 31 };
+DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
+DECLARE_ALIGNED_8(static const int, m1p1[2]) = { 1<<31, 0 };
+DECLARE_ALIGNED_8(static const FFTSample, root2[2]) = { M_SQRT1_2, M_SQRT1_2 };
 
-static const int m1p1[2] __attribute__((aligned(8))) =
-    { 1 << 31, 0 };
+#ifdef EMULATE_3DNOWEXT
+#define PSWAPD(s,d)\
+    "movq "#s","#d"\n"\
+    "psrlq $32,"#d"\n"\
+    "punpckldq "#s","#d"\n"
+#define PSWAPD_UNARY(s)\
+    "sub $8, %%"REG_SP"\n"\
+    "movd "#s", 4(%%"REG_SP")\n"\
+    "punpckhdq (%%"REG_SP"), "#s"\n"\
+    "add $8, %%"REG_SP"\n"
+#define ff_fft_calc_3dn2 ff_fft_calc_3dn
+#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
+#define ff_imdct_half_3dn2 ff_imdct_half_3dn
+#else
+#define PSWAPD(s,d)     "pswapd "#s","#d"\n"
+#define PSWAPD_UNARY(s) "pswapd "#s","#s"\n"
+#endif
 
-void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+#define T2(m0,m1,z0,z1)\
+    asm volatile(\
+        "movq     %0,  "#z0" \n"\
+        "movq   "#z0", "#z1" \n"\
+        "pfadd    %1,  "#z0" \n"\
+        "pfsub    %1,  "#z1" \n"\
+        ::"m"(m0), "m"(m1)\
+    );
+
+#define T4(z0,z1,z2,z3,t0,t1)\
+    asm volatile(\
+        "movq   "#z2", "#t1" \n" /* FIXME redundant */\
+        "movq   "#z2", "#t0" \n" /* {r2,i2} */\
+        "pfsub  "#z3", "#t1" \n"\
+        "pfadd  "#z3", "#t0" \n" /* {t6,t5} */\
+        "pxor     %0,  "#t1" \n" /* {t8,t7} */\
+        "movq   "#z0", "#z2" \n"\
+        PSWAPD_UNARY(t1)\
+        "pfadd  "#t0", "#z0" \n" /* {r0,i0} */\
+        "pfsub  "#t0", "#z2" \n" /* {r2,i2} */\
+        "movq   "#z1", "#z3" \n"\
+        "pfadd  "#t1", "#z1" \n" /* {r1,i1} */\
+        "pfsub  "#t1", "#z3" \n" /* {r3,i3} */\
+        ::"m"(*m1p1)\
+    );
+
+#define LOAD(mem,mmx)\
+    asm volatile("movq %0, "#mmx ::"m"(mem))
+
+#define SAVE(mem,mmx)\
+    asm volatile("movq "#mmx", %0" :"=m"(mem))
+
+#define PUNPCK(a,b,c)\
+    asm volatile(\
+        "movq      "#a", "#c"\n"\
+        "punpckldq "#b", "#a"\n"\
+        "punpckhdq "#b", "#c"\n"\
+    :);
+
+#define PUNPCK_MEM(a,b,c)\
+    asm volatile(\
+        "movq      "#a", "#c"\n"\
+        "punpckldq  %0,  "#a"\n"\
+        "punpckhdq  %0,  "#c"\n"\
+        ::"m"(b)\
+    );
+
+static void fft4(FFTComplex *z)
 {
-    int ln = s->nbits;
-    long j;
-    x86_reg i;
-    long nblocks, nloops;
-    FFTComplex *p, *cptr;
+    T2(z[0], z[1], %%mm0, %%mm1);
+    LOAD(z[2], %%mm2);
+    LOAD(z[3], %%mm3);
+    T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
+    PUNPCK(%%mm0, %%mm1, %%mm4);
+    PUNPCK(%%mm2, %%mm3, %%mm5);
+    SAVE(z[0], %%mm0);
+    SAVE(z[1], %%mm4);
+    SAVE(z[2], %%mm2);
+    SAVE(z[3], %%mm5);
+}
 
+static void fft8(FFTComplex *z)
+{
+    T2(z[0], z[1], %%mm0, %%mm1);
+    LOAD(z[2], %%mm2);
+    LOAD(z[3], %%mm3);
+    T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
+    SAVE(z[0], %%mm0);
+    SAVE(z[2], %%mm2);
+    T2(z[4], z[5], %%mm4, %%mm5);
+    T2(z[6], z[7], %%mm6, %%mm7);
     asm volatile(
-        /* FEMMS is not a must here but recommended by AMD */
-        "femms \n\t"
-        "movq %0, %%mm7 \n\t"
-        ::"m"(*(s->inverse ? m1p1 : p1m1))
+        PSWAPD( %%mm5, %%mm0 )
+        PSWAPD( %%mm7, %%mm2 )
+        "pxor   %0,    %%mm0 \n"
+        "pxor   %0,    %%mm2 \n"
+        "pfsub  %%mm0, %%mm5 \n"
+        "pfadd  %%mm2, %%mm7 \n"
+        "pfmul  %1,    %%mm5 \n"
+        "pfmul  %1,    %%mm7 \n"
+        ::"m"(*m1p1), "m"(*root2)
     );
+    T4(%%mm1, %%mm3, %%mm5, %%mm7, %%mm0, %%mm2);
+    SAVE(z[5], %%mm5);
+    SAVE(z[7], %%mm7);
+    LOAD(z[0], %%mm0);
+    LOAD(z[2], %%mm2);
+    T4(%%mm0, %%mm2, %%mm4, %%mm6, %%mm5, %%mm7);
+    PUNPCK(%%mm0, %%mm1, %%mm5);
+    PUNPCK(%%mm2, %%mm3, %%mm7);
+    SAVE(z[0], %%mm0);
+    SAVE(z[1], %%mm5);
+    SAVE(z[2], %%mm2);
+    SAVE(z[3], %%mm7);
+    PUNPCK_MEM(%%mm4, z[5], %%mm5);
+    PUNPCK_MEM(%%mm6, z[7], %%mm7);
+    SAVE(z[4], %%mm4);
+    SAVE(z[5], %%mm5);
+    SAVE(z[6], %%mm6);
+    SAVE(z[7], %%mm7);
+}
 
-    i = 8 << ln;
+/* z[0...8n-1], w[1...2n-1] */
+static av_noinline void pass(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
+    // FIXME: AMD has the same store->load aliasing issue as Intel, just with
+    // a larger threshold (64KB rather than 4KB). This is high enough that I
+    // haven't bothered to make two versions of pass(), but it would be faster
+    // if someone cares about FFT of size >=2^14.
     asm volatile(
-        "1: \n\t"
-        "sub $32, %0 \n\t"
-        "movq    (%0,%1), %%mm0 \n\t"
-        "movq  16(%0,%1), %%mm1 \n\t"
-        "movq   8(%0,%1), %%mm2 \n\t"
-        "movq  24(%0,%1), %%mm3 \n\t"
-        "movq      %%mm0, %%mm4 \n\t"
-        "movq      %%mm1, %%mm5 \n\t"
-        "pfadd     %%mm2, %%mm0 \n\t"
-        "pfadd     %%mm3, %%mm1 \n\t"
-        "pfsub     %%mm2, %%mm4 \n\t"
-        "pfsub     %%mm3, %%mm5 \n\t"
-        "movq      %%mm0, %%mm2 \n\t"
-        "pswapd    %%mm5, %%mm5 \n\t"
-        "movq      %%mm4, %%mm3 \n\t"
-        "pxor      %%mm7, %%mm5 \n\t"
-        "pfadd     %%mm1, %%mm0 \n\t"
-        "pfadd     %%mm5, %%mm4 \n\t"
-        "pfsub     %%mm1, %%mm2 \n\t"
-        "pfsub     %%mm5, %%mm3 \n\t"
-        "movq      %%mm0,   (%0,%1) \n\t"
-        "movq      %%mm4,  8(%0,%1) \n\t"
-        "movq      %%mm2, 16(%0,%1) \n\t"
-        "movq      %%mm3, 24(%0,%1) \n\t"
-        "jg 1b \n\t"
-        :"+r"(i)
-        :"r"(z)
+        "1: \n"
+        "movq   (%0,%3,4), %%mm4 \n" // r2
+        "movq  8(%0,%3,4), %%mm5 \n" // i2
+        "movq       %%mm4, %%mm2 \n"
+        "movq        (%1), %%mm0 \n" // wre
+        "movq       %%mm5, %%mm3 \n"
+        "movq     (%1,%3), %%mm1 \n" // wim
+        "pfmul      %%mm0, %%mm2 \n" // r2*wre
+        "movq     (%0,%4), %%mm6 \n" // r3
+        "pfmul      %%mm1, %%mm3 \n" // i2*wim
+        "movq    8(%0,%4), %%mm7 \n" // i3
+        "pfmul      %%mm1, %%mm4 \n" // r2*wim
+        "pfmul      %%mm0, %%mm5 \n" // i2*wre
+        "pfadd      %%mm3, %%mm2 \n" // r2*wre + i2*wim
+        "movq       %%mm1, %%mm3 \n"
+        "pfmul      %%mm6, %%mm1 \n" // r3*wim
+        "pfsub      %%mm4, %%mm5 \n" // i2*wre - r2*wim
+        "movq       %%mm0, %%mm4 \n"
+        "pfmul      %%mm7, %%mm3 \n" // i3*wim
+        "pfmul      %%mm6, %%mm4 \n" // r3*wre
+        "pfmul      %%mm7, %%mm0 \n" // i3*wre
+        "pfsub      %%mm3, %%mm4 \n" // r3*wre - i3*wim
+        "movq        (%0), %%mm3 \n"
+        "pfadd      %%mm1, %%mm0 \n" // i3*wre + r3*wim
+        "movq       %%mm4, %%mm1 \n"
+        "pfadd      %%mm2, %%mm4 \n" // t5
+        "pfsub      %%mm2, %%mm1 \n" // t3
+        "pfsub      %%mm4, %%mm3 \n" // r2
+        "movq       %%mm3, (%0,%3,4) \n"
+        "pfadd       (%0), %%mm4 \n" // r0
+        "movq       %%mm4, (%0) \n"
+        "movq       %%mm5, %%mm4 \n"
+        "pfadd      %%mm0, %%mm5 \n" // t6
+        "movq  8(%0,%3,2), %%mm3 \n"
+        "pfsub      %%mm1, %%mm3 \n" // i3
+        "pfsub      %%mm0, %%mm4 \n" // t4
+        "movq       %%mm3, 8(%0,%4) \n"
+        "movq   (%0,%3,2), %%mm3 \n"
+        "pfadd 8(%0,%3,2), %%mm1 \n" // i1
+        "pfsub      %%mm4, %%mm3 \n" // r3
+        "movq       %%mm1, 8(%0,%3,2) \n"
+        "movq       %%mm3, (%0,%4) \n"
+        "pfadd  (%0,%3,2), %%mm4 \n" // r1
+        "movq       8(%0), %%mm3 \n"
+        "movq       %%mm4, (%0,%3,2) \n"
+        "pfsub      %%mm5, %%mm3 \n" // i2
+        "pfadd      8(%0), %%mm5 \n" // i0
+        "movq       %%mm3, 8(%0,%3,4) \n"
+        "movq       %%mm5, 8(%0) \n"
+        "add $16, %0 \n"
+        "add $8, %1 \n"
+        "dec %2 \n"
+        "jg 1b \n"
+        :"+r"(z), "+r"(w), "+r"(n)
+        :"r"(n*8), "r"(n*48)
+        :"memory"
     );
-    /* pass 2 .. ln-1 */
+}
+
+static void fft16(FFTComplex *z);
+#include "../fft_template.h"
+DECL_FFT(16,8,4,)
 
-    nblocks = 1 << (ln-3);
-    nloops = 1 << 2;
-    cptr = s->exptab1;
-    do {
-        p = z;
-        j = nblocks;
-        do {
-            i = nloops*8;
-            asm volatile(
-                "1: \n\t"
-                "sub $16, %0 \n\t"
-                "movq    (%1,%0), %%mm0 \n\t"
-                "movq   8(%1,%0), %%mm1 \n\t"
-                "movq    (%2,%0), %%mm2 \n\t"
-                "movq   8(%2,%0), %%mm3 \n\t"
-                "movq  (%3,%0,2), %%mm4 \n\t"
-                "movq 8(%3,%0,2), %%mm5 \n\t"
-                "pswapd    %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
-                "pswapd    %%mm5, %%mm7 \n\t"
-                "pfmul     %%mm2, %%mm4 \n\t" // cre*re cim*im
-                "pfmul     %%mm3, %%mm5 \n\t"
-                "pfmul     %%mm2, %%mm6 \n\t" // cim*re cre*im
-                "pfmul     %%mm3, %%mm7 \n\t"
-                "pfpnacc   %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
-                "pfpnacc   %%mm7, %%mm5 \n\t"
-                "movq      %%mm0, %%mm2 \n\t"
-                "movq      %%mm1, %%mm3 \n\t"
-                "pfadd     %%mm4, %%mm0 \n\t"
-                "pfadd     %%mm5, %%mm1 \n\t"
-                "pfsub     %%mm4, %%mm2 \n\t"
-                "pfsub     %%mm5, %%mm3 \n\t"
-                "movq      %%mm0,  (%1,%0) \n\t"
-                "movq      %%mm1, 8(%1,%0) \n\t"
-                "movq      %%mm2,  (%2,%0) \n\t"
-                "movq      %%mm3, 8(%2,%0) \n\t"
-                "jg 1b \n\t"
-                :"+r"(i)
-                :"r"(p), "r"(p + nloops), "r"(cptr)
-            );
-            p += nloops*2;
-        } while (--j);
-        cptr += nloops*2;
-        nblocks >>= 1;
-        nloops <<= 1;
-    } while (nblocks != 0);
+void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+{
+    int n = 1<<s->nbits;
+    int i;
+    fft_dispatch[s->nbits-2](z);
     asm volatile("femms");
+    for(i=0; i<n; i+=2)
+        FFSWAP(FFTSample, z[i].im, z[i+1].re);
 }
 
 static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c
index 305f44a..32b05d4 100644
--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -1,6 +1,6 @@
 /*
  * FFT/MDCT transform with SSE optimizations
- * Copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2008 Loren Merritt
  *
  * This file is part of FFmpeg.
  *
@@ -22,124 +22,368 @@
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
 
-static const int p1p1p1m1[4] __attribute__((aligned(16))) =
-    { 0, 0, 0, 1 << 31 };
+DECLARE_ALIGNED_16(static const FFTSample, root2[4])  = { M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,  M_SQRT1_2};
+DECLARE_ALIGNED_16(static const FFTSample, root2mppm[4]) = {-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2};
+DECLARE_ALIGNED_16(static const int, m1m1m1m1[4]) = { 1<<31, 1<<31, 1<<31, 1<<31 };
+DECLARE_ALIGNED_16(static const int, p1m1p1m1[4]) = { 0, 1<<31, 0, 1<<31 };
 
-static const int p1p1m1p1[4] __attribute__((aligned(16))) =
-    { 0, 0, 1 << 31, 0 };
+// in:  r0={r0,i0,r1,i1} t0={r2,i2,r3,i3}
+// out: r0={r0,r1,r2,r3} i0={i0,i1,i2,i3}
+#define T4(r0,i0,t0)\
+    asm("movaps        "#r0", "#i0" \n"\
+        "shufps $0x64, "#t0", "#r0" \n" /* {r0,i0,r3,i2} */\
+        "shufps $0xce, "#t0", "#i0" \n" /* {r1,i1,r2,i3} */\
+        "movaps        "#r0", "#t0" \n"\
+        "addps         "#i0", "#r0" \n" /* {t1,t2,t6,t5} */\
+        "subps         "#i0", "#t0" \n" /* {t3,t4,t8,t7} */\
+        "movaps        "#r0", "#i0" \n"\
+        "shufps $0x44, "#t0", "#r0" \n" /* {t1,t2,t3,t4} */\
+        "shufps $0xbe, "#t0", "#i0" \n" /* {t6,t5,t7,t8} */\
+        "movaps        "#r0", "#t0" \n"\
+        "addps         "#i0", "#r0" \n" /* {r0,i0,r1,i1} */\
+        "subps         "#i0", "#t0" \n" /* {r2,i2,r3,i3} */\
+        "movaps        "#r0", "#i0" \n"\
+        "shufps $0x88, "#t0", "#r0" \n" /* {r0,r1,r2,r3} */\
+        "shufps $0xdd, "#t0", "#i0" \n" /* {i0,i1,i2,i3} */\
+        :\
+    );
 
-static const int p1p1m1m1[4] __attribute__((aligned(16))) =
-    { 0, 0, 1 << 31, 1 << 31 };
+#define T8(r0,i0,r1,i1,t0,t1)\
+    asm("movaps        "#r1", "#t0" \n"\
+        "shufps $0x44, "#i1", "#r1" \n" /* {r4,i4,r6,i6} */\
+        "shufps $0xee, "#i1", "#t0" \n" /* {r5,i5,r7,i7} */\
+        "movaps        "#r1", "#t1" \n"\
+        "subps         "#t0", "#r1" \n" /* {r5,i5,r7,i7} */\
+        "addps         "#t0", "#t1" \n" /* {t1,t2,t3,t4} */\
+        "movaps        "#r1", "#t0" \n"\
+        "shufps $0xb1, "#t0", "#t0" \n" /* {i5,r5,i7,r7} */\
+        "mulps           %1,  "#r1" \n" /* {-r5,i5,r7,-i7} */\
+        "mulps           %0,  "#t0" \n"\
+        "addps         "#t0", "#r1" \n" /* {t8,t7,ta,t9} */\
+        "movaps        "#t1", "#t0" \n"\
+        "shufps $0x36, "#r1", "#t1" \n" /* {t3,t2,t9,t8} */\
+        "shufps $0x9c, "#r1", "#t0" \n" /* {t1,t4,t7,ta} */\
+        "movaps        "#t1", "#r1" \n"\
+        "addps         "#t0", "#t1" \n" /* {t1,t2,t9,ta} */\
+        "subps         "#t0", "#r1" \n" /* {t6,t5,tc,tb} */\
+        "movaps        "#t1", "#t0" \n"\
+        "shufps $0xd8, "#r1", "#t1" \n" /* {t1,t9,t5,tb} */\
+        "shufps $0x8d, "#r1", "#t0" \n" /* {t2,ta,t6,tc} */\
+        "movaps        "#r0", "#r1" \n"\
+        "movaps        "#i0", "#i1" \n"\
+        "addps         "#t1", "#r0" \n" /* {r0,r1,r2,r3} */\
+        "addps         "#t0", "#i0" \n" /* {i0,i1,i2,i3} */\
+        "subps         "#t1", "#r1" \n" /* {r4,r5,r6,r7} */\
+        "subps         "#t0", "#i1" \n" /* {i4,i5,i6,i7} */\
+        ::"m"(*root2), "m"(*root2mppm)\
+    );
 
-static const int p1m1p1m1[4] __attribute__((aligned(16))) =
-    { 0, 1 << 31, 0, 1 << 31 };
+#define LOAD(mem,xmm)\
+    asm("movaps %0, "#xmm ::"m"(mem))
 
-static const int m1m1m1m1[4] __attribute__((aligned(16))) =
-    { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
+#define SAVE(mem,xmm)\
+    asm("movaps "#xmm", %0" :"=m"(mem))
 
-#if 0
-static void print_v4sf(const char *str, __m128 a)
+static void fft4(FFTComplex *z)
 {
-    float *p = (float *)&a;
-    printf("%s: %f %f %f %f\n",
-           str, p[0], p[1], p[2], p[3]);
+    LOAD(z[0], %%xmm0);
+    LOAD(z[2], %%xmm2);
+    T4(%%xmm0, %%xmm1, %%xmm2);
+    SAVE(z[0], %%xmm0);
+    SAVE(z[2], %%xmm1);
 }
-#endif
 
-/* XXX: handle reverse case */
-void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+static void fft8(FFTComplex *z)
 {
-    int ln = s->nbits;
-    x86_reg i;
-    long j;
-    long nblocks, nloops;
-    FFTComplex *p, *cptr;
+    LOAD(z[0], %%xmm0);
+    LOAD(z[2], %%xmm2);
+    T4(%%xmm0, %%xmm1, %%xmm2);
+    LOAD(z[4], %%xmm2);
+    LOAD(z[6], %%xmm3);
+    T8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5);
+    SAVE(z[0], %%xmm0);
+    SAVE(z[2], %%xmm1);
+    SAVE(z[4], %%xmm2);
+    SAVE(z[6], %%xmm3);
+}
 
+static void fft16(FFTComplex *z)
+{
+    LOAD(z[0], %%xmm0);
+    LOAD(z[2], %%xmm2);
+    T4(%%xmm0, %%xmm1, %%xmm2);
+    LOAD(z[4], %%xmm2);
+    LOAD(z[6], %%xmm3);
+    T8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5);
+    LOAD(z[8], %%xmm4);
+    LOAD(z[10], %%xmm6);
+    SAVE(z[0], %%xmm0);
+    SAVE(z[2], %%xmm1);
+    SAVE(z[4], %%xmm2);
+    SAVE(z[6], %%xmm3);
+    T4(%%xmm4, %%xmm5, %%xmm6);
+    LOAD(z[12], %%xmm6);
+    LOAD(z[14], %%xmm0);
+    T4(%%xmm6, %%xmm7, %%xmm0);
+
+    asm("movaps          %1, %%xmm0 \n" // wre
+        "movaps      %%xmm4, %%xmm2 \n"
+        "movaps       16+%1, %%xmm1 \n" // wim
+        "movaps      %%xmm5, %%xmm3 \n"
+        "mulps       %%xmm0, %%xmm2 \n" // r2*wre
+        "mulps       %%xmm1, %%xmm3 \n" // i2*wim
+        "mulps       %%xmm1, %%xmm4 \n" // r2*wim
+        "mulps       %%xmm0, %%xmm5 \n" // i2*wre
+        "addps       %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+        "movaps      %%xmm1, %%xmm3 \n"
+        "mulps       %%xmm6, %%xmm1 \n" // r3*wim
+        "subps       %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+        "movaps      %%xmm0, %%xmm4 \n"
+        "mulps       %%xmm7, %%xmm3 \n" // i3*wim
+        "mulps       %%xmm6, %%xmm4 \n" // r3*wre
+        "mulps       %%xmm7, %%xmm0 \n" // i3*wre
+        "subps       %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+        "movaps    0x00(%0), %%xmm3 \n"
+        "addps       %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+        "movaps      %%xmm4, %%xmm1 \n"
+        "addps       %%xmm2, %%xmm4 \n" // t5
+        "subps       %%xmm2, %%xmm1 \n" // t3
+        "subps       %%xmm4, %%xmm3 \n" // r2
+        "addps     0x00(%0), %%xmm4 \n" // r0
+        "movaps    0x20(%0), %%xmm6 \n"
+        "movaps      %%xmm3, 0x40(%0) \n"
+        "movaps      %%xmm4, 0x00(%0) \n"
+        "movaps      %%xmm5, %%xmm3 \n"
+        "subps       %%xmm0, %%xmm5 \n" // t4
+        "movaps      %%xmm6, %%xmm4 \n"
+        "subps       %%xmm5, %%xmm6 \n" // r3
+        "addps       %%xmm4, %%xmm5 \n" // r1
+        "movaps      %%xmm6, 0x60(%0) \n"
+        "movaps      %%xmm5, 0x20(%0) \n"
+        "movaps    0x30(%0), %%xmm2 \n"
+        "addps       %%xmm0, %%xmm3 \n" // t6
+        "subps       %%xmm1, %%xmm2 \n" // i3
+        "movaps    0x10(%0), %%xmm7 \n"
+        "addps     0x30(%0), %%xmm1 \n" // i1
+        "movaps      %%xmm2, 0x70(%0) \n"
+        "movaps      %%xmm1, 0x30(%0) \n"
+        "movaps      %%xmm7, %%xmm4 \n"
+        "subps       %%xmm3, %%xmm7 \n" // i2
+        "addps       %%xmm4, %%xmm3 \n" // i0
+        "movaps      %%xmm7, 0x50(%0) \n"
+        "movaps      %%xmm3, 0x10(%0) \n"
+        ::"r"(z), "m"(*ff_cos_16)
+        :"memory"
+    );
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+static av_noinline void pass(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
     asm volatile(
-        "movaps %0, %%xmm4 \n\t"
-        "movaps %1, %%xmm5 \n\t"
-        ::"m"(*p1p1m1m1),
-          "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
+        "1: \n"
+        "movaps   (%0,%3,4), %%xmm4 \n" // r2
+        "movaps 16(%0,%3,4), %%xmm5 \n" // i2
+        "movaps      %%xmm4, %%xmm2 \n"
+        "movaps        (%1), %%xmm0 \n" // wre
+        "movaps      %%xmm5, %%xmm3 \n"
+        "movaps     (%1,%3), %%xmm1 \n" // wim
+        "mulps       %%xmm0, %%xmm2 \n" // r2*wre
+        "movaps     (%0,%4), %%xmm6 \n" // r3
+        "mulps       %%xmm1, %%xmm3 \n" // i2*wim
+        "movaps   16(%0,%4), %%xmm7 \n" // i3
+        "mulps       %%xmm1, %%xmm4 \n" // r2*wim
+        "mulps       %%xmm0, %%xmm5 \n" // i2*wre
+        "addps       %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+        "movaps      %%xmm1, %%xmm3 \n"
+        "mulps       %%xmm6, %%xmm1 \n" // r3*wim
+        "subps       %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+        "movaps      %%xmm0, %%xmm4 \n"
+        "mulps       %%xmm7, %%xmm3 \n" // i3*wim
+        "mulps       %%xmm6, %%xmm4 \n" // r3*wre
+        "mulps       %%xmm7, %%xmm0 \n" // i3*wre
+        "subps       %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+        "movaps        (%0), %%xmm3 \n"
+        "addps       %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+        "movaps      %%xmm4, %%xmm1 \n"
+        "addps       %%xmm2, %%xmm4 \n" // t5
+        "subps       %%xmm2, %%xmm1 \n" // t3
+        "subps       %%xmm4, %%xmm3 \n" // r2
+        "addps         (%0), %%xmm4 \n" // r0
+        "movaps   (%0,%3,2), %%xmm6 \n"
+        "movaps      %%xmm3, (%0,%3,4) \n"
+        "movaps      %%xmm4, (%0) \n"
+        "movaps      %%xmm5, %%xmm3 \n"
+        "subps       %%xmm0, %%xmm5 \n" // t4
+        "movaps      %%xmm6, %%xmm4 \n"
+        "subps       %%xmm5, %%xmm6 \n" // r3
+        "addps       %%xmm4, %%xmm5 \n" // r1
+        "movaps      %%xmm6, (%0,%4) \n"
+        "movaps      %%xmm5, (%0,%3,2) \n"
+        "movaps 16(%0,%3,2), %%xmm2 \n"
+        "addps       %%xmm0, %%xmm3 \n" // t6
+        "subps       %%xmm1, %%xmm2 \n" // i3
+        "movaps      16(%0), %%xmm7 \n"
+        "addps  16(%0,%3,2), %%xmm1 \n" // i1
+        "movaps      %%xmm2, 16(%0,%4) \n"
+        "movaps      %%xmm1, 16(%0,%3,2) \n"
+        "movaps      %%xmm7, %%xmm4 \n"
+        "subps       %%xmm3, %%xmm7 \n" // i2
+        "addps       %%xmm4, %%xmm3 \n" // i0
+        "movaps      %%xmm7, 16(%0,%3,4) \n"
+        "movaps      %%xmm3, 16(%0) \n"
+        "add $32, %0 \n"
+        "add $16, %1 \n"
+        "sub $2, %2 \n"
+        "jg 1b \n"
+        :"+r"(z), "+r"(w), "+r"(n)
+        :"r"(n*8), "r"(n*48)
+        :"memory"
     );
+}
 
-    i = 8 << ln;
+static av_noinline void pass_interleave(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
     asm volatile(
-        "1: \n\t"
-        "sub $32, %0 \n\t"
-        /* do the pass 0 butterfly */
-        "movaps   (%0,%1), %%xmm0 \n\t"
-        "movaps    %%xmm0, %%xmm1 \n\t"
-        "shufps     $0x4E, %%xmm0, %%xmm0 \n\t"
-        "xorps     %%xmm4, %%xmm1 \n\t"
-        "addps     %%xmm1, %%xmm0 \n\t"
-        "movaps 16(%0,%1), %%xmm2 \n\t"
-        "movaps    %%xmm2, %%xmm3 \n\t"
-        "shufps     $0x4E, %%xmm2, %%xmm2 \n\t"
-        "xorps     %%xmm4, %%xmm3 \n\t"
-        "addps     %%xmm3, %%xmm2 \n\t"
-        /* multiply third by -i */
-        /* by toggling the sign bit */
-        "shufps     $0xB4, %%xmm2, %%xmm2 \n\t"
-        "xorps     %%xmm5, %%xmm2 \n\t"
-        /* do the pass 1 butterfly */
-        "movaps    %%xmm0, %%xmm1 \n\t"
-        "addps     %%xmm2, %%xmm0 \n\t"
-        "subps     %%xmm2, %%xmm1 \n\t"
-        "movaps    %%xmm0,   (%0,%1) \n\t"
-        "movaps    %%xmm1, 16(%0,%1) \n\t"
-        "jg 1b \n\t"
-        :"+r"(i)
-        :"r"(z)
+        "1: \n"
+        "movaps   (%0,%3,4), %%xmm4 \n" // r2
+        "movaps 16(%0,%3,4), %%xmm5 \n" // i2
+        "movaps      %%xmm4, %%xmm2 \n"
+        "movaps        (%1), %%xmm0 \n" // wre
+        "movaps      %%xmm5, %%xmm3 \n"
+        "movaps     (%1,%3), %%xmm1 \n" // wim
+        "mulps       %%xmm0, %%xmm2 \n" // r2*wre
+        "movaps     (%0,%4), %%xmm6 \n" // r3
+        "mulps       %%xmm1, %%xmm3 \n" // i2*wim
+        "movaps   16(%0,%4), %%xmm7 \n" // i3
+        "mulps       %%xmm1, %%xmm4 \n" // r2*wim
+        "mulps       %%xmm0, %%xmm5 \n" // i2*wre
+        "addps       %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+        "movaps      %%xmm1, %%xmm3 \n"
+        "mulps       %%xmm6, %%xmm1 \n" // r3*wim
+        "subps       %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+        "movaps      %%xmm0, %%xmm4 \n"
+        "mulps       %%xmm7, %%xmm3 \n" // i3*wim
+        "mulps       %%xmm6, %%xmm4 \n" // r3*wre
+        "mulps       %%xmm7, %%xmm0 \n" // i3*wre
+        "subps       %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+        "movaps        (%0), %%xmm3 \n"
+        "addps       %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+        "movaps      %%xmm4, %%xmm1 \n"
+        "addps       %%xmm2, %%xmm4 \n" // t5
+        "subps       %%xmm2, %%xmm1 \n" // t3
+        "subps       %%xmm4, %%xmm3 \n" // r2
+        "addps         (%0), %%xmm4 \n" // r0
+        "movaps   (%0,%3,2), %%xmm6 \n"
+        "movaps      %%xmm3, (%0,%3,4) \n"
+        "movaps      %%xmm4, (%0) \n"
+        "movaps      %%xmm5, %%xmm3 \n"
+        "subps       %%xmm0, %%xmm5 \n" // t4
+        "movaps      %%xmm6, %%xmm4 \n"
+        "subps       %%xmm5, %%xmm6 \n" // r3
+        "addps       %%xmm4, %%xmm5 \n" // r1
+        "movaps 16(%0,%3,2), %%xmm2 \n"
+        "addps       %%xmm0, %%xmm3 \n" // t6
+        "subps       %%xmm1, %%xmm2 \n" // i3
+        "movaps      16(%0), %%xmm7 \n"
+        "addps  16(%0,%3,2), %%xmm1 \n" // i1
+        "movaps      %%xmm7, %%xmm4 \n"
+        "subps       %%xmm3, %%xmm7 \n" // i2
+        "addps       %%xmm4, %%xmm3 \n" // i0
+        "movaps      %%xmm5, %%xmm4 \n" // r1
+        "movaps      %%xmm6, %%xmm0 \n" // r3
+        "unpcklps    %%xmm1, %%xmm5 \n"
+        "unpckhps    %%xmm1, %%xmm4 \n"
+        "unpcklps    %%xmm2, %%xmm6 \n"
+        "unpckhps    %%xmm2, %%xmm0 \n"
+        "movaps        (%0), %%xmm1 \n"
+        "movaps   (%0,%3,4), %%xmm2 \n"
+        "movaps      %%xmm5,   (%0,%3,2) \n"
+        "movaps      %%xmm4, 16(%0,%3,2) \n"
+        "movaps      %%xmm6,   (%0,%4) \n"
+        "movaps      %%xmm0, 16(%0,%4) \n"
+        "movaps      %%xmm1, %%xmm5 \n" // r0
+        "movaps      %%xmm2, %%xmm4 \n" // r2
+        "unpcklps    %%xmm3, %%xmm1 \n"
+        "unpckhps    %%xmm3, %%xmm5 \n"
+        "unpcklps    %%xmm7, %%xmm2 \n"
+        "unpckhps    %%xmm7, %%xmm4 \n"
+        "movaps      %%xmm1,   (%0) \n"
+        "movaps      %%xmm5, 16(%0) \n"
+        "movaps      %%xmm2,   (%0,%3,4) \n"
+        "movaps      %%xmm4, 16(%0,%3,4) \n"
+        "add $32, %0 \n"
+        "add $16, %1 \n"
+        "sub $2, %2 \n"
+        "jg 1b \n"
+        :"+r"(z), "+r"(w), "+r"(n)
+        :"r"(n*8), "r"(n*48)
+        :"memory"
     );
-    /* pass 2 .. ln-1 */
-
-    nblocks = 1 << (ln-3);
-    nloops = 1 << 2;
-    cptr = s->exptab1;
-    do {
-        p = z;
-        j = nblocks;
-        do {
-            i = nloops*8;
-            asm volatile(
-                "1: \n\t"
-                "sub $32, %0 \n\t"
-                "movaps    (%2,%0), %%xmm1 \n\t"
-                "movaps    (%1,%0), %%xmm0 \n\t"
-                "movaps  16(%2,%0), %%xmm5 \n\t"
-                "movaps  16(%1,%0), %%xmm4 \n\t"
-                "movaps     %%xmm1, %%xmm2 \n\t"
-                "movaps     %%xmm5, %%xmm6 \n\t"
-                "shufps      $0xA0, %%xmm1, %%xmm1 \n\t"
-                "shufps      $0xF5, %%xmm2, %%xmm2 \n\t"
-                "shufps      $0xA0, %%xmm5, %%xmm5 \n\t"
-                "shufps      $0xF5, %%xmm6, %%xmm6 \n\t"
-                "mulps   (%3,%0,2), %%xmm1 \n\t" //  cre*re cim*re
-                "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
-                "mulps 32(%3,%0,2), %%xmm5 \n\t" //  cre*re cim*re
-                "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
-                "addps      %%xmm2, %%xmm1 \n\t"
-                "addps      %%xmm6, %%xmm5 \n\t"
-                "movaps     %%xmm0, %%xmm3 \n\t"
-                "movaps     %%xmm4, %%xmm7 \n\t"
-                "addps      %%xmm1, %%xmm0 \n\t"
-                "subps      %%xmm1, %%xmm3 \n\t"
-                "addps      %%xmm5, %%xmm4 \n\t"
-                "subps      %%xmm5, %%xmm7 \n\t"
-                "movaps     %%xmm0, (%1,%0) \n\t"
-                "movaps     %%xmm3, (%2,%0) \n\t"
-                "movaps     %%xmm4, 16(%1,%0) \n\t"
-                "movaps     %%xmm7, 16(%2,%0) \n\t"
-                "jg 1b \n\t"
-                :"+r"(i)
-                :"r"(p), "r"(p + nloops), "r"(cptr)
-            );
-            p += nloops*2;
-        } while (--j);
-        cptr += nloops*2;
-        nblocks >>= 1;
-        nloops <<= 1;
-    } while (nblocks != 0);
+}
+
+#include "../fft_template.h"
+
+DECL_FFT(32,16,8,_interleave)
+DECL_FFT(64,32,16,_interleave)
+DECL_FFT(128,64,32,_interleave)
+DECL_FFT(256,128,64,_interleave)
+DECL_FFT(512,256,128,_interleave)
+DECL_FFT(1024,512,256,_interleave)
+DECL_FFT(2048,1024,512,_interleave)
+DECL_FFT(4096,2048,1024,_interleave)
+DECL_FFT(8192,4096,2048,_interleave)
+DECL_FFT(16384,8192,4096,_interleave)
+DECL_FFT(32768,16384,8192,_interleave)
+DECL_FFT(65536,32768,16384,_interleave)
+
+static void (*fft_dispatch_interleave[])(FFTComplex*) = {
+    fft4, fft8, fft16,
+    fft32_interleave, fft64_interleave, fft128_interleave, fft256_interleave,
+    fft512_interleave, fft1024_interleave, fft2048_interleave, fft4096_interleave,
+    fft8192_interleave, fft16384_interleave, fft32768_interleave, fft65536_interleave,
+};
+
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+{
+    int n = 1 << s->nbits;
+
+    fft_dispatch_interleave[s->nbits-2](z);
+
+    if(n <= 16) {
+        x86_reg i = -8*n;
+        asm volatile(
+            "1: \n"
+            "movaps     (%0,%1), %%xmm0 \n"
+            "movaps      %%xmm0, %%xmm1 \n"
+            "unpcklps 16(%0,%1), %%xmm0 \n"
+            "unpckhps 16(%0,%1), %%xmm1 \n"
+            "movaps      %%xmm0,   (%0,%1) \n"
+            "movaps      %%xmm1, 16(%0,%1) \n"
+            "add $32, %0 \n"
+            "jl 1b \n"
+            :"+r"(i)
+            :"r"(z+n)
+            :"memory"
+        );
+    }
+}
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
+{
+    int n = 1 << s->nbits;
+    int i;
+    for(i=0; i<n; i+=2) {
+        asm volatile(
+            "movaps %2, %%xmm0 \n"
+            "movlps %%xmm0, %0 \n"
+            "movhps %%xmm0, %1 \n"
+            :"=m"(s->tmp_buf[s->revtab[i]]),
+             "=m"(s->tmp_buf[s->revtab[i+1]])
+            :"m"(z[i])
+        );
+    }
+    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
 }
 
 static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
-- 
1.5.5.1



