[FFmpeg-devel] [PATCH] split-radix FFT
Loren Merritt
lorenm
Sat Jul 26 04:14:00 CEST 2008
$subject, vaguely based on djbfft.
Changed from djb:
* added simd.
* removed the hand-scheduled pentium-pro code. gcc's output from simple C
is better on all cpus I have access to.
* removed the distinction between fft and ifft. they're just permutations
of each other, so the difference belongs in revtab[] and not in the code
(a standalone sketch of this identity follows the list).
* removed the distinction between pass() and pass_big(). C can always use
the memory-efficient version, and simd never does because the shuffles are
too costly.
* made an entirely different pass_big(), to avoid store->load aliasing.
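For reference, the identity involved: an unnormalized inverse DFT equals the
forward DFT read out at negated indices, IDFT(x)[k] = DFT(x)[(N-k) mod N], so
the entire fft/ifft difference can be folded into a precomputed permutation.
A minimal standalone check, not part of the patch; naive_dft is just a
hypothetical O(n^2) reference:

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    #define N 16

    /* O(n^2) reference DFT; sign=+1 forward, sign=-1 (unnormalized) inverse */
    static void naive_dft(complex double *out, const complex double *in, int sign)
    {
        for (int k = 0; k < N; k++) {
            out[k] = 0;
            for (int j = 0; j < N; j++)
                out[k] += in[j] * cexp(-2.0 * I * M_PI * sign * j * k / N);
        }
    }

    int main(void)
    {
        complex double x[N], f[N], g[N];
        for (int i = 0; i < N; i++)
            x[i] = cos(i) + I * sin(3 * i);     /* arbitrary test signal */
        naive_dft(f, x, 1);                     /* forward */
        naive_dft(g, x, -1);                    /* unnormalized inverse */
        double err = 0;
        for (int k = 0; k < N; k++)
            err += cabs(g[k] - f[(N - k) % N]); /* ifft = permuted fft */
        printf("total error: %g\n", err);       /* should be ~1e-14 */
        return 0;
    }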
I tried the tangent FFT, but I couldn't make it faster than split-radix.
Tangent has asymptotically 5% fewer arithmetic ops, but only 1-2% fewer at
sizes typical of audio codecs, and even a couple of extra shuffles or other
overhead erases that margin.
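To put rough numbers on that, using the usual real-op counts from the
literature (split-radix: 4n*lg(n) - 6n + 8; tangent / modified split-radix:
~(34/9)*n*lg(n) - (124/27)*n + O(lg n) -- treat those constants as
approximate):

    n=512:  split-radix:  4*512*9 - 6*512 + 8          = 15368 ops
            tangent:      (34/9)*512*9 - (124/27)*512 ~= 15057 ops (~2% fewer)
    asymptotically:       (34/9)/4 ~= 0.944                      (~5.6% fewer)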
I tried an in-place fft_permute, but it wasn't any faster than out-of-place
+ memcpy, and it was quite a bit more complex.
benchmarks (cycles):
2^4 2^5 2^6 2^7 2^8 2^9 2^10 2^11 2^12 2^13 2^14 2^15 2^16 fft size
368 1204 2925 6958 15864 35742 79907 176930 391877 954522 2582623 7027105 15979384 k8.svn.c
201 641 1666 4168 9730 22360 49814 111063 245835 551301 2140560 5449270 14161100 k8.djb.c
194 627 1656 4169 9923 22881 51998 116276 257871 577594 1983282 4926132 12508115 k8.split.c
148 526 1546 3537 7757 17209 38176 85052 189656 461192 1180269 2665751 6856414 k8.svn.sse
122 459 1380 3111 6687 14719 32318 71412 165771 418068 1112945 2575784 6684035 k8.svn.3dn2
173 478 1208 2832 6397 14097 30816 66861 145040 327352 1163446 2907148 7465755 k8.split.3dn2
114 363 934 2206 5190 11527 25515 55826 122329 272338 821973 1974221 4931610 k8.split.sse
298 870 2196 4998 11462 25954 58196 129402 290652 652258 1515451 3379696 7337748 conroe.svn.c
138 396 1016 2453 5703 13080 29542 72980 166937 382352 852779 1965809 4289844 conroe.djb.c
132 383 959 2443 5691 12957 29167 64625 141885 310046 670222 1449479 3131753 conroe.split.c
114 390 926 1978 4112 8825 19316 42252 94760 214178 462689 994594 2200585 conroe.svn.sse
82 216 498 1221 2630 5715 12265 26415 58853 126274 269532 568314 1220458 conroe.split.sse
318 932 2430 5578 12857 29093 65321 145709 327347 745524 1763483 3977390 8756696 penryn.svn.c
129 420 1146 2824 6670 15907 35248 87700 199251 454614 1015788 2309352 5068660 penryn.djb.c
151 424 1094 2746 6369 14604 32945 73085 161065 354042 768205 1672258 3601885 penryn.split.c
72 335 834 1776 3626 7733 17273 38134 88629 204577 449889 982217 2171035 penryn.svn.sse
36 144 418 1054 2343 5127 11289 24670 56975 124178 268514 578586 1262736 penryn.split.sse
729 1869 4909 12412 28876 65702 147951 337286 786350 1770779 3988037 10739966 42763027 prescott.svn.c
463 1224 3179 7644 18052 41395 91632 206204 450451 985720 2173172 4685558 10307042 prescott.djb.c
434 1155 2894 6963 16356 37468 87928 202304 457022 1012743 2257329 5002317 10999607 prescott.split.c
222 522 1208 2909 6858 15517 34318 83046 193854 465508 1221169 5221734 17064482 prescott.svn.sse
124 370 850 2003 4377 10050 22798 51432 113311 254526 590618 1548052 3930814 prescott.split.sse
--Loren Merritt
-------------- next part --------------
---
libavcodec/dsputil.h | 24 +++-
libavcodec/fft.c | 351 +++++++++++++++++++++-------------
libavcodec/fft_template.h | 53 +++++
libavcodec/i386/fft_3dn.c | 111 +-----------
libavcodec/i386/fft_3dn2.c | 283 +++++++++++++++++++---------
libavcodec/i386/fft_sse.c | 452 ++++++++++++++++++++++++++++++++++----------
6 files changed, 840 insertions(+), 434 deletions(-)
create mode 100644 libavcodec/fft_template.h
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 7a47b87..cf1a467 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -638,6 +638,8 @@ typedef struct FFTContext {
uint16_t *revtab;
FFTComplex *exptab;
FFTComplex *exptab1; /* only used by SSE code */
+ FFTComplex *tmp_buf;
+ void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
@@ -646,13 +648,18 @@ typedef struct FFTContext {
} FFTContext;
int ff_fft_init(FFTContext *s, int nbits, int inverse);
-void ff_fft_permute(FFTContext *s, FFTComplex *z);
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
+static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
+{
+ s->fft_permute(s, z);
+}
static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
{
s->fft_calc(s, z);
@@ -670,6 +677,21 @@ typedef struct MDCTContext {
FFTContext fft;
} MDCTContext;
+/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
+extern FFTSample ff_cos_16[8];
+extern FFTSample ff_cos_32[16];
+extern FFTSample ff_cos_64[32];
+extern FFTSample ff_cos_128[64];
+extern FFTSample ff_cos_256[128];
+extern FFTSample ff_cos_512[256];
+extern FFTSample ff_cos_1024[512];
+extern FFTSample ff_cos_2048[1024];
+extern FFTSample ff_cos_4096[2048];
+extern FFTSample ff_cos_8192[4096];
+extern FFTSample ff_cos_16384[8192];
+extern FFTSample ff_cos_32768[16384];
+extern FFTSample ff_cos_65536[32768];
+
/**
* Generate a Kaiser-Bessel Derived Window.
* @param window pointer to half window
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 47e9e06..2c1bb8c 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -1,6 +1,8 @@
/*
* FFT/IFFT transforms
+ * Copyright (c) 2008 Loren Merritt
* Copyright (c) 2002 Fabrice Bellard.
+ * Partly based on libdjbfft, Copyright (c) 1999 D. J. Bernstein
*
* This file is part of FFmpeg.
*
@@ -26,6 +28,38 @@
#include "dsputil.h"
+DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]);
+DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]);
+static FFTSample *ff_cos_tabs[] = {
+ ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024,
+ ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536,
+};
+
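+/* Computes the reordering implied by the split-radix recursion: even outputs
+ * come from the half-size sub-FFT, odd outputs from the two quarter-size
+ * sub-FFTs. The fft/ifft difference is folded in here (the i ^= m below),
+ * so the butterfly code itself is direction-agnostic. */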
+static int split_radix_permutation(int i, int n, int inverse)
+{
+ int m;
+ if(n <= 2) return i;
+ m = n >> 1;
+ if(i < m) return split_radix_permutation(i,m,inverse) << 1;
+ i -= m;
+ m >>= 1;
+ if(!inverse) i ^= m;
+ if(i < m) return (split_radix_permutation(i,m,inverse) << 2) + 1;
+ i -= m;
+ return ((split_radix_permutation(i,m,inverse) << 2) - 1) & (n - 1);
+}
+
/**
* The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
* done
@@ -34,12 +68,15 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
{
int i, j, m, n;
float alpha, c1, s1, s2;
- int shuffle = 0;
+ int split_radix = 1;
int av_unused has_vectors;
+ if (nbits < 2 || nbits > 16)
+ goto fail;
s->nbits = nbits;
n = 1 << nbits;
+ s->tmp_buf = NULL;
s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
if (!s->exptab)
goto fail;
@@ -50,13 +87,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
s2 = inverse ? 1.0 : -1.0;
- for(i=0;i<(n/2);i++) {
- alpha = 2 * M_PI * (float)i / (float)n;
- c1 = cos(alpha);
- s1 = sin(alpha) * s2;
- s->exptab[i].re = c1;
- s->exptab[i].im = s1;
- }
+ s->fft_permute = ff_fft_permute_c;
s->fft_calc = ff_fft_calc_c;
s->imdct_calc = ff_imdct_calc;
s->imdct_half = ff_imdct_half;
@@ -64,36 +95,55 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
#ifdef HAVE_MMX
has_vectors = mm_support();
- shuffle = 1;
- if (has_vectors & MM_3DNOWEXT) {
- /* 3DNowEx for K7/K8 */
+ if (has_vectors & MM_SSE) {
+ /* SSE for P3/P4/K8 */
+ s->imdct_calc = ff_imdct_calc_sse;
+ s->imdct_half = ff_imdct_half_sse;
+ s->fft_permute = ff_fft_permute_sse;
+ s->fft_calc = ff_fft_calc_sse;
+ } else if (has_vectors & MM_3DNOWEXT) {
+ /* 3DNowEx for K7 */
s->imdct_calc = ff_imdct_calc_3dn2;
s->imdct_half = ff_imdct_half_3dn2;
s->fft_calc = ff_fft_calc_3dn2;
} else if (has_vectors & MM_3DNOW) {
/* 3DNow! for K6-2/3 */
s->fft_calc = ff_fft_calc_3dn;
- } else if (has_vectors & MM_SSE) {
- /* SSE for P3/P4 */
- s->imdct_calc = ff_imdct_calc_sse;
- s->imdct_half = ff_imdct_half_sse;
- s->fft_calc = ff_fft_calc_sse;
- } else {
- shuffle = 0;
}
#elif defined HAVE_ALTIVEC && !defined ALTIVEC_USE_REFERENCE_C_CODE
has_vectors = mm_support();
if (has_vectors & MM_ALTIVEC) {
s->fft_calc = ff_fft_calc_altivec;
- shuffle = 1;
+ split_radix = 0;
}
#endif
/* compute constant table for HAVE_SSE version */
- if (shuffle) {
+ if (split_radix) {
+ for(j=4; j<=nbits; j++) {
+ int m = 1<<j;
+ double freq = 2*M_PI/m;
+ FFTSample *tab = ff_cos_tabs[j-4];
+ for(i=0; i<=m/4; i++)
+ tab[i] = cos(i*freq);
+ for(i=1; i<m/4; i++)
+ tab[m/2-i] = tab[i];
+ }
+ for(i=0; i<n; i++)
+ s->revtab[(n - split_radix_permutation(i, n, s->inverse)) % n] = i;
+ s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
+ } else {
int np, nblocks, np2, l;
FFTComplex *q;
+ for(i=0; i<(n/2); i++) {
+ alpha = 2 * M_PI * (float)i / (float)n;
+ c1 = cos(alpha);
+ s1 = sin(alpha) * s2;
+ s->exptab[i].re = c1;
+ s->exptab[i].im = s1;
+ }
+
np = 1 << nbits;
nblocks = np >> 3;
np2 = np >> 1;
@@ -116,137 +166,43 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
nblocks = nblocks >> 1;
} while (nblocks != 0);
av_freep(&s->exptab);
- }
- /* compute bit reverse table */
-
- for(i=0;i<n;i++) {
- m=0;
- for(j=0;j<nbits;j++) {
- m |= ((i >> j) & 1) << (nbits-j-1);
+ for(i=0;i<n;i++) {
+ m=0;
+ for(j=0;j<nbits;j++) {
+ m |= ((i >> j) & 1) << (nbits-j-1);
+ }
+ s->revtab[i]=m;
}
- s->revtab[i]=m;
}
+
return 0;
fail:
av_freep(&s->revtab);
av_freep(&s->exptab);
av_freep(&s->exptab1);
+ av_freep(&s->tmp_buf);
return -1;
}
-/* butter fly op */
-#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
-{\
- FFTSample ax, ay, bx, by;\
- bx=pre1;\
- by=pim1;\
- ax=qre1;\
- ay=qim1;\
- pre = (bx + ax);\
- pim = (by + ay);\
- qre = (bx - ax);\
- qim = (by - ay);\
-}
-
-#define MUL16(a,b) ((a) * (b))
-
-#define CMUL(pre, pim, are, aim, bre, bim) \
-{\
- pre = (MUL16(are, bre) - MUL16(aim, bim));\
- pim = (MUL16(are, bim) + MUL16(bre, aim));\
-}
-
-/**
- * Do a complex FFT with the parameters defined in ff_fft_init(). The
- * input data must be permuted before with s->revtab table. No
- * 1.0/sqrt(n) normalization is done.
- */
-void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
-{
- int ln = s->nbits;
- int j, np, np2;
- int nblocks, nloops;
- register FFTComplex *p, *q;
- FFTComplex *exptab = s->exptab;
- int l;
- FFTSample tmp_re, tmp_im;
-
- np = 1 << ln;
-
- /* pass 0 */
-
- p=&z[0];
- j=(np >> 1);
- do {
- BF(p[0].re, p[0].im, p[1].re, p[1].im,
- p[0].re, p[0].im, p[1].re, p[1].im);
- p+=2;
- } while (--j != 0);
-
- /* pass 1 */
-
-
- p=&z[0];
- j=np >> 2;
- if (s->inverse) {
- do {
- BF(p[0].re, p[0].im, p[2].re, p[2].im,
- p[0].re, p[0].im, p[2].re, p[2].im);
- BF(p[1].re, p[1].im, p[3].re, p[3].im,
- p[1].re, p[1].im, -p[3].im, p[3].re);
- p+=4;
- } while (--j != 0);
- } else {
- do {
- BF(p[0].re, p[0].im, p[2].re, p[2].im,
- p[0].re, p[0].im, p[2].re, p[2].im);
- BF(p[1].re, p[1].im, p[3].re, p[3].im,
- p[1].re, p[1].im, p[3].im, -p[3].re);
- p+=4;
- } while (--j != 0);
- }
- /* pass 2 .. ln-1 */
-
- nblocks = np >> 3;
- nloops = 1 << 2;
- np2 = np >> 1;
- do {
- p = z;
- q = z + nloops;
- for (j = 0; j < nblocks; ++j) {
- BF(p->re, p->im, q->re, q->im,
- p->re, p->im, q->re, q->im);
-
- p++;
- q++;
- for(l = nblocks; l < np2; l += nblocks) {
- CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
- BF(p->re, p->im, q->re, q->im,
- p->re, p->im, tmp_re, tmp_im);
- p++;
- q++;
- }
-
- p += nloops;
- q += nloops;
- }
- nblocks = nblocks >> 1;
- nloops = nloops << 1;
- } while (nblocks != 0);
-}
-
/**
* Do the permutation needed BEFORE calling ff_fft_calc()
*/
-void ff_fft_permute(FFTContext *s, FFTComplex *z)
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
{
int j, k, np;
FFTComplex tmp;
const uint16_t *revtab = s->revtab;
+ np = 1 << s->nbits;
+
+ if (s->tmp_buf) {
+ /* TODO: handle split-radix permute in a more optimal way, probably in-place */
+ for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+ memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
+ return;
+ }
/* reverse */
- np = 1 << s->nbits;
for(j=0;j<np;j++) {
k = revtab[j];
if (k < j) {
@@ -262,5 +218,138 @@ void ff_fft_end(FFTContext *s)
av_freep(&s->revtab);
av_freep(&s->exptab);
av_freep(&s->exptab1);
+ av_freep(&s->tmp_buf);
+}
+
+#define sqrthalf (float)M_SQRT1_2
+
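+/* basic butterfly: x = a-b, y = a+b */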
+#define BF(x,y,a,b) {\
+ x = a - b;\
+ y = a + b;\
+}
+
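+/* combine a0..a3 with the twiddled values t1,t2,t5,t6 set up by TRANSFORM() */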
+#define BUTTERFLIES(a0,a1,a2,a3) {\
+ BF(t3, t5, t5, t1);\
+ BF(a2.re, a0.re, a0.re, t5);\
+ BF(a3.im, a1.im, a1.im, t3);\
+ BF(t4, t6, t2, t6);\
+ BF(a3.re, a1.re, a1.re, t4);\
+ BF(a2.im, a0.im, a0.im, t6);\
+}
+
+// force loading all the inputs before storing any.
+// this is slightly slower for small data, but avoids store->load aliasing
+// for addresses separated by large powers of 2.
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
+ FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
+ BF(t3, t5, t5, t1);\
+ BF(a2.re, a0.re, r0, t5);\
+ BF(a3.im, a1.im, i1, t3);\
+ BF(t4, t6, t2, t6);\
+ BF(a3.re, a1.re, r1, t4);\
+ BF(a2.im, a0.im, i0, t6);\
+}
+
+#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
+ t1 = a2.re * wre + a2.im * wim;\
+ t2 = a2.im * wre - a2.re * wim;\
+ t5 = a3.re * wre - a3.im * wim;\
+ t6 = a3.im * wre + a3.re * wim;\
+ BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
+ t1 = a2.re;\
+ t2 = a2.im;\
+ t5 = a3.re;\
+ t6 = a3.im;\
+ BUTTERFLIES(a0,a1,a2,a3)\
+}
+
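+/* size-4 FFT, two radix-2 stages fused into one block of butterflies */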
+static void fft4(FFTComplex *z)
+{
+ FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+ BF(t3, t1, z[0].re, z[1].re);
+ BF(t8, t6, z[3].re, z[2].re);
+ BF(z[2].re, z[0].re, t1, t6);
+ BF(t4, t2, z[0].im, z[1].im);
+ BF(t7, t5, z[2].im, z[3].im);
+ BF(z[3].im, z[1].im, t4, t8);
+ BF(z[3].re, z[1].re, t3, t7);
+ BF(z[2].im, z[0].im, t2, t5);
+}
+
+static void fft8(FFTComplex *z)
+{
+ FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+ fft4(z);
+
+ BF(t1, z[5].re, z[4].re, -z[5].re);
+ BF(t2, z[5].im, z[4].im, -z[5].im);
+ BF(t3, z[7].re, z[6].re, -z[7].re);
+ BF(t4, z[7].im, z[6].im, -z[7].im);
+ BF(t8, t1, t3, t1);
+ BF(t7, t2, t2, t4);
+ BF(z[4].re, z[0].re, z[0].re, t1);
+ BF(z[4].im, z[0].im, z[0].im, t2);
+ BF(z[6].re, z[2].re, z[2].re, t7);
+ BF(z[6].im, z[2].im, z[2].im, t8);
+
+ TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
+}
+
+static void fft16(FFTComplex *z)
+{
+ FFTSample t1, t2, t3, t4, t5, t6;
+
+ fft8(z);
+ fft4(z+8);
+ fft4(z+12);
+
+ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
+ TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
+ TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]);
+ TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]);
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+#define PASS(name)\
+static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
+{\
+ FFTSample t1, t2, t3, t4, t5, t6;\
+ int o1 = 2*n;\
+ int o2 = 4*n;\
+ int o3 = 6*n;\
+ const FFTSample *wim = wre+o1;\
+ n--;\
+\
+ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
+ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+ do {\
+ z += 2;\
+ wre += 2;\
+ wim -= 2;\
+ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
+ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+ } while(--n);\
+}
+
+PASS(pass)
+#undef BUTTERFLIES
+#define BUTTERFLIES BUTTERFLIES_BIG
+PASS(pass_big)
+
+#include "fft_template.h"
+
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init(). The
+ * input data must be permuted before with s->revtab table. No
+ * 1.0/sqrt(n) normalization is done.
+ */
+void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
+{
+ fft_dispatch[s->nbits-2](z);
}
diff --git a/libavcodec/fft_template.h b/libavcodec/fft_template.h
new file mode 100644
index 0000000..a4c54b0
--- /dev/null
+++ b/libavcodec/fft_template.h
@@ -0,0 +1,53 @@
+/**
+ * @file fft_template.h
+ * FFT/IFFT transforms
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
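+/* declare fft##n() as one half-size and two quarter-size sub-FFTs
+ * followed by a combining pass */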
+#define DECL_FFT(n,n2,n4,suffix)\
+static void fft##n##suffix(FFTComplex *z)\
+{\
+ fft##n2(z);\
+ fft##n4(z+n4*2);\
+ fft##n4(z+n4*3);\
+ pass##suffix(z,ff_cos_##n,n4/2);\
+}
+
+DECL_FFT(32,16,8,)
+DECL_FFT(64,32,16,)
+DECL_FFT(128,64,32,)
+DECL_FFT(256,128,64,)
+DECL_FFT(512,256,128,)
+#ifdef BUTTERFLIES_BIG
+#define pass pass_big
+#endif
+DECL_FFT(1024,512,256,)
+DECL_FFT(2048,1024,512,)
+DECL_FFT(4096,2048,1024,)
+DECL_FFT(8192,4096,2048,)
+DECL_FFT(16384,8192,4096,)
+DECL_FFT(32768,16384,8192,)
+DECL_FFT(65536,32768,16384,)
+#undef pass
+
+static void (*fft_dispatch[])(FFTComplex*) = {
+ fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
+ fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
+};
+
diff --git a/libavcodec/i386/fft_3dn.c b/libavcodec/i386/fft_3dn.c
index 8bd7b89..6f2e2e8 100644
--- a/libavcodec/i386/fft_3dn.c
+++ b/libavcodec/i386/fft_3dn.c
@@ -1,7 +1,6 @@
/*
* FFT/MDCT transform with 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
- * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2008 Loren Merritt
*
* This file is part of FFmpeg.
*
@@ -20,109 +19,5 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-
-static const int p1m1[2] __attribute__((aligned(8))) =
- { 0, 1 << 31 };
-
-static const int m1p1[2] __attribute__((aligned(8))) =
- { 1 << 31, 0 };
-
-void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
-{
- int ln = s->nbits;
- long j;
- x86_reg i;
- long nblocks, nloops;
- FFTComplex *p, *cptr;
-
- asm volatile(
- /* FEMMS is not a must here but recommended by AMD */
- "femms \n\t"
- "movq %0, %%mm7 \n\t"
- ::"m"(*(s->inverse ? m1p1 : p1m1))
- );
-
- i = 8 << ln;
- asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- "movq (%0,%1), %%mm0 \n\t"
- "movq 16(%0,%1), %%mm1 \n\t"
- "movq 8(%0,%1), %%mm2 \n\t"
- "movq 24(%0,%1), %%mm3 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm1, %%mm5 \n\t"
- "pfadd %%mm2, %%mm0 \n\t"
- "pfadd %%mm3, %%mm1 \n\t"
- "pfsub %%mm2, %%mm4 \n\t"
- "pfsub %%mm3, %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "punpckldq %%mm5, %%mm6 \n\t"
- "punpckhdq %%mm6, %%mm5 \n\t"
- "movq %%mm4, %%mm3 \n\t"
- "pxor %%mm7, %%mm5 \n\t"
- "pfadd %%mm1, %%mm0 \n\t"
- "pfadd %%mm5, %%mm4 \n\t"
- "pfsub %%mm1, %%mm2 \n\t"
- "pfsub %%mm5, %%mm3 \n\t"
- "movq %%mm0, (%0,%1) \n\t"
- "movq %%mm4, 8(%0,%1) \n\t"
- "movq %%mm2, 16(%0,%1) \n\t"
- "movq %%mm3, 24(%0,%1) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(z)
- );
- /* pass 2 .. ln-1 */
-
- nblocks = 1 << (ln-3);
- nloops = 1 << 2;
- cptr = s->exptab1;
- do {
- p = z;
- j = nblocks;
- do {
- i = nloops*8;
- asm volatile(
- "1: \n\t"
- "sub $16, %0 \n\t"
- "movq (%1,%0), %%mm0 \n\t"
- "movq 8(%1,%0), %%mm1 \n\t"
- "movq (%2,%0), %%mm2 \n\t"
- "movq 8(%2,%0), %%mm3 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm3, %%mm5 \n\t"
- "punpckldq %%mm2, %%mm2 \n\t"
- "punpckldq %%mm3, %%mm3 \n\t"
- "punpckhdq %%mm4, %%mm4 \n\t"
- "punpckhdq %%mm5, %%mm5 \n\t"
- "pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re
- "pfmul 8(%3,%0,2), %%mm3 \n\t"
- "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
- "pfmul 24(%3,%0,2), %%mm5 \n\t"
- "pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
- "pfadd %%mm3, %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pfadd %%mm4, %%mm0 \n\t"
- "pfadd %%mm5, %%mm1 \n\t"
- "pfsub %%mm4, %%mm2 \n\t"
- "pfsub %%mm5, %%mm3 \n\t"
- "movq %%mm0, (%1,%0) \n\t"
- "movq %%mm1, 8(%1,%0) \n\t"
- "movq %%mm2, (%2,%0) \n\t"
- "movq %%mm3, 8(%2,%0) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(p), "r"(p + nloops), "r"(cptr)
- );
- p += nloops*2;
- } while (--j);
- cptr += nloops*2;
- nblocks >>= 1;
- nloops <<= 1;
- } while (nblocks != 0);
- asm volatile("femms");
-}
+#define EMULATE_3DNOWEXT
+#include "fft_3dn2.c"
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index 9068dff..42ac691 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -1,7 +1,6 @@
/*
* FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
- * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
*
* This file is part of FFmpeg.
*
@@ -23,105 +22,209 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
-static const int p1m1[2] __attribute__((aligned(8))) =
- { 0, 1 << 31 };
+DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
+DECLARE_ALIGNED_8(static const int, m1p1[2]) = { 1<<31, 0 };
+DECLARE_ALIGNED_8(static const FFTSample, root2[2]) = { M_SQRT1_2, M_SQRT1_2 };
-static const int m1p1[2] __attribute__((aligned(8))) =
- { 1 << 31, 0 };
+#ifdef EMULATE_3DNOWEXT
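+/* synthesize pswapd (swap the two 32-bit lanes) from plain MMX/3DNow! ops */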
+#define PSWAPD(s,d)\
+ "movq "#s","#d"\n"\
+ "psrlq $32,"#d"\n"\
+ "punpckldq "#s","#d"\n"
+#define PSWAPD_UNARY(s)\
+ "sub $8, %%"REG_SP"\n"\
+ "movd "#s", 4(%%"REG_SP")\n"\
+ "punpckhdq (%%"REG_SP"), "#s"\n"\
+ "add $8, %%"REG_SP"\n"
+#define ff_fft_calc_3dn2 ff_fft_calc_3dn
+#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
+#define ff_imdct_half_3dn2 ff_imdct_half_3dn
+#else
+#define PSWAPD(s,d) "pswapd "#s","#d"\n"
+#define PSWAPD_UNARY(s) "pswapd "#s","#s"\n"
+#endif
-void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+#define T2(m0,m1,z0,z1)\
+ asm volatile(\
+ "movq %0, "#z0" \n"\
+ "movq "#z0", "#z1" \n"\
+ "pfadd %1, "#z0" \n"\
+ "pfsub %1, "#z1" \n"\
+ ::"m"(m0), "m"(m1)\
+ );
+
+#define T4(z0,z1,z2,z3,t0,t1)\
+ asm volatile(\
+ "movq "#z2", "#t1" \n" /* FIXME redundant */\
+ "movq "#z2", "#t0" \n" /* {r2,i2} */\
+ "pfsub "#z3", "#t1" \n"\
+ "pfadd "#z3", "#t0" \n" /* {t6,t5} */\
+ "pxor %0, "#t1" \n" /* {t8,t7} */\
+ "movq "#z0", "#z2" \n"\
+ PSWAPD_UNARY(t1)\
+ "pfadd "#t0", "#z0" \n" /* {r0,i0} */\
+ "pfsub "#t0", "#z2" \n" /* {r2,i2} */\
+ "movq "#z1", "#z3" \n"\
+ "pfadd "#t1", "#z1" \n" /* {r1,i1} */\
+ "pfsub "#t1", "#z3" \n" /* {r3,i3} */\
+ ::"m"(*m1p1)\
+ );
+
+#define LOAD(mem,mmx)\
+ asm volatile("movq %0, "#mmx ::"m"(mem))
+
+#define SAVE(mem,mmx)\
+ asm volatile("movq "#mmx", %0" :"=m"(mem))
+
+#define PUNPCK(a,b,c)\
+ asm volatile(\
+ "movq "#a", "#c"\n"\
+ "punpckldq "#b", "#a"\n"\
+ "punpckhdq "#b", "#c"\n"\
+ :);
+
+#define PUNPCK_MEM(a,b,c)\
+ asm volatile(\
+ "movq "#a", "#c"\n"\
+ "punpckldq %0, "#a"\n"\
+ "punpckhdq %0, "#c"\n"\
+ ::"m"(b)\
+ );
+
+static void fft4(FFTComplex *z)
{
- int ln = s->nbits;
- long j;
- x86_reg i;
- long nblocks, nloops;
- FFTComplex *p, *cptr;
+ T2(z[0], z[1], %%mm0, %%mm1);
+ LOAD(z[2], %%mm2);
+ LOAD(z[3], %%mm3);
+ T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
+ PUNPCK(%%mm0, %%mm1, %%mm4);
+ PUNPCK(%%mm2, %%mm3, %%mm5);
+ SAVE(z[0], %%mm0);
+ SAVE(z[1], %%mm4);
+ SAVE(z[2], %%mm2);
+ SAVE(z[3], %%mm5);
+}
+static void fft8(FFTComplex *z)
+{
+ T2(z[0], z[1], %%mm0, %%mm1);
+ LOAD(z[2], %%mm2);
+ LOAD(z[3], %%mm3);
+ T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
+ SAVE(z[0], %%mm0);
+ SAVE(z[2], %%mm2);
+ T2(z[4], z[5], %%mm4, %%mm5);
+ T2(z[6], z[7], %%mm6, %%mm7);
asm volatile(
- /* FEMMS is not a must here but recommended by AMD */
- "femms \n\t"
- "movq %0, %%mm7 \n\t"
- ::"m"(*(s->inverse ? m1p1 : p1m1))
+ PSWAPD( %%mm5, %%mm0 )
+ PSWAPD( %%mm7, %%mm2 )
+ "pxor %0, %%mm0 \n"
+ "pxor %0, %%mm2 \n"
+ "pfsub %%mm0, %%mm5 \n"
+ "pfadd %%mm2, %%mm7 \n"
+ "pfmul %1, %%mm5 \n"
+ "pfmul %1, %%mm7 \n"
+ ::"m"(*m1p1), "m"(*root2)
);
+ T4(%%mm1, %%mm3, %%mm5, %%mm7, %%mm0, %%mm2);
+ SAVE(z[5], %%mm5);
+ SAVE(z[7], %%mm7);
+ LOAD(z[0], %%mm0);
+ LOAD(z[2], %%mm2);
+ T4(%%mm0, %%mm2, %%mm4, %%mm6, %%mm5, %%mm7);
+ PUNPCK(%%mm0, %%mm1, %%mm5);
+ PUNPCK(%%mm2, %%mm3, %%mm7);
+ SAVE(z[0], %%mm0);
+ SAVE(z[1], %%mm5);
+ SAVE(z[2], %%mm2);
+ SAVE(z[3], %%mm7);
+ PUNPCK_MEM(%%mm4, z[5], %%mm5);
+ PUNPCK_MEM(%%mm6, z[7], %%mm7);
+ SAVE(z[4], %%mm4);
+ SAVE(z[5], %%mm5);
+ SAVE(z[6], %%mm6);
+ SAVE(z[7], %%mm7);
+}
- i = 8 << ln;
+/* z[0...8n-1], w[1...2n-1] */
+static av_noinline void pass(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
+ // FIXME: AMD has the same store->load aliasing issue as Intel, just with
+ // a larger threshold (64KB rather than 4KB). This is high enough that I
+ // haven't bothered to make two versions of pass(), but it would be faster
+ // if someone cares about FFT of size >=2^14.
asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- "movq (%0,%1), %%mm0 \n\t"
- "movq 16(%0,%1), %%mm1 \n\t"
- "movq 8(%0,%1), %%mm2 \n\t"
- "movq 24(%0,%1), %%mm3 \n\t"
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm1, %%mm5 \n\t"
- "pfadd %%mm2, %%mm0 \n\t"
- "pfadd %%mm3, %%mm1 \n\t"
- "pfsub %%mm2, %%mm4 \n\t"
- "pfsub %%mm3, %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "pswapd %%mm5, %%mm5 \n\t"
- "movq %%mm4, %%mm3 \n\t"
- "pxor %%mm7, %%mm5 \n\t"
- "pfadd %%mm1, %%mm0 \n\t"
- "pfadd %%mm5, %%mm4 \n\t"
- "pfsub %%mm1, %%mm2 \n\t"
- "pfsub %%mm5, %%mm3 \n\t"
- "movq %%mm0, (%0,%1) \n\t"
- "movq %%mm4, 8(%0,%1) \n\t"
- "movq %%mm2, 16(%0,%1) \n\t"
- "movq %%mm3, 24(%0,%1) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(z)
+ "1: \n"
+ "movq (%0,%3,4), %%mm4 \n" // r2
+ "movq 8(%0,%3,4), %%mm5 \n" // i2
+ "movq %%mm4, %%mm2 \n"
+ "movq (%1), %%mm0 \n" // wre
+ "movq %%mm5, %%mm3 \n"
+ "movq (%1,%3), %%mm1 \n" // wim
+ "pfmul %%mm0, %%mm2 \n" // r2*wre
+ "movq (%0,%4), %%mm6 \n" // r3
+ "pfmul %%mm1, %%mm3 \n" // i2*wim
+ "movq 8(%0,%4), %%mm7 \n" // i3
+ "pfmul %%mm1, %%mm4 \n" // r2*wim
+ "pfmul %%mm0, %%mm5 \n" // i2*wre
+ "pfadd %%mm3, %%mm2 \n" // r2*wre + i2*wim
+ "movq %%mm1, %%mm3 \n"
+ "pfmul %%mm6, %%mm1 \n" // r3*wim
+ "pfsub %%mm4, %%mm5 \n" // i2*wre - r2*wim
+ "movq %%mm0, %%mm4 \n"
+ "pfmul %%mm7, %%mm3 \n" // i3*wim
+ "pfmul %%mm6, %%mm4 \n" // r3*wre
+ "pfmul %%mm7, %%mm0 \n" // i3*wre
+ "pfsub %%mm3, %%mm4 \n" // r3*wre - i3*wim
+ "movq (%0), %%mm3 \n"
+ "pfadd %%mm1, %%mm0 \n" // i3*wre + r3*wim
+ "movq %%mm4, %%mm1 \n"
+ "pfadd %%mm2, %%mm4 \n" // t5
+ "pfsub %%mm2, %%mm1 \n" // t3
+ "pfsub %%mm4, %%mm3 \n" // r2
+ "movq %%mm3, (%0,%3,4) \n"
+ "pfadd (%0), %%mm4 \n" // r0
+ "movq %%mm4, (%0) \n"
+ "movq %%mm5, %%mm4 \n"
+ "pfadd %%mm0, %%mm5 \n" // t6
+ "movq 8(%0,%3,2), %%mm3 \n"
+ "pfsub %%mm1, %%mm3 \n" // i3
+ "pfsub %%mm0, %%mm4 \n" // t4
+ "movq %%mm3, 8(%0,%4) \n"
+ "movq (%0,%3,2), %%mm3 \n"
+ "pfadd 8(%0,%3,2), %%mm1 \n" // i1
+ "pfsub %%mm4, %%mm3 \n" // r3
+ "movq %%mm1, 8(%0,%3,2) \n"
+ "movq %%mm3, (%0,%4) \n"
+ "pfadd (%0,%3,2), %%mm4 \n" // r1
+ "movq 8(%0), %%mm3 \n"
+ "movq %%mm4, (%0,%3,2) \n"
+ "pfsub %%mm5, %%mm3 \n" // i2
+ "pfadd 8(%0), %%mm5 \n" // i0
+ "movq %%mm3, 8(%0,%3,4) \n"
+ "movq %%mm5, 8(%0) \n"
+ "add $16, %0 \n"
+ "add $8, %1 \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ :"+r"(z), "+r"(w), "+r"(n)
+ :"r"(n*8), "r"(n*48)
+ :"memory"
);
- /* pass 2 .. ln-1 */
+}
+
+static void fft16(FFTComplex *z);
+#include "../fft_template.h"
+DECL_FFT(16,8,4,)
- nblocks = 1 << (ln-3);
- nloops = 1 << 2;
- cptr = s->exptab1;
- do {
- p = z;
- j = nblocks;
- do {
- i = nloops*8;
- asm volatile(
- "1: \n\t"
- "sub $16, %0 \n\t"
- "movq (%1,%0), %%mm0 \n\t"
- "movq 8(%1,%0), %%mm1 \n\t"
- "movq (%2,%0), %%mm2 \n\t"
- "movq 8(%2,%0), %%mm3 \n\t"
- "movq (%3,%0,2), %%mm4 \n\t"
- "movq 8(%3,%0,2), %%mm5 \n\t"
- "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
- "pswapd %%mm5, %%mm7 \n\t"
- "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im
- "pfmul %%mm3, %%mm5 \n\t"
- "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im
- "pfmul %%mm3, %%mm7 \n\t"
- "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
- "pfpnacc %%mm7, %%mm5 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pfadd %%mm4, %%mm0 \n\t"
- "pfadd %%mm5, %%mm1 \n\t"
- "pfsub %%mm4, %%mm2 \n\t"
- "pfsub %%mm5, %%mm3 \n\t"
- "movq %%mm0, (%1,%0) \n\t"
- "movq %%mm1, 8(%1,%0) \n\t"
- "movq %%mm2, (%2,%0) \n\t"
- "movq %%mm3, 8(%2,%0) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(p), "r"(p + nloops), "r"(cptr)
- );
- p += nloops*2;
- } while (--j);
- cptr += nloops*2;
- nblocks >>= 1;
- nloops <<= 1;
- } while (nblocks != 0);
+void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+{
+ int n = 1<<s->nbits;
+ int i;
+ fft_dispatch[s->nbits-2](z);
asm volatile("femms");
+ for(i=0; i<n; i+=2)
+ FFSWAP(FFTSample, z[i].im, z[i+1].re);
}
static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c
index 305f44a..32b05d4 100644
--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -1,6 +1,6 @@
/*
* FFT/MDCT transform with SSE optimizations
- * Copyright (c) 2002 Fabrice Bellard.
+ * Copyright (c) 2008 Loren Merritt
*
* This file is part of FFmpeg.
*
@@ -22,124 +22,368 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
-static const int p1p1p1m1[4] __attribute__((aligned(16))) =
- { 0, 0, 0, 1 << 31 };
+DECLARE_ALIGNED_16(static const FFTSample, root2[4]) = { M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2};
+DECLARE_ALIGNED_16(static const FFTSample, root2mppm[4]) = {-M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2};
+DECLARE_ALIGNED_16(static const int, m1m1m1m1[4]) = { 1<<31, 1<<31, 1<<31, 1<<31 };
+DECLARE_ALIGNED_16(static const int, p1m1p1m1[4]) = { 0, 1<<31, 0, 1<<31 };
-static const int p1p1m1p1[4] __attribute__((aligned(16))) =
- { 0, 0, 1 << 31, 0 };
+// in: r0={r0,i0,r1,i1} t0={r2,i2,r3,i3}
+// out: r0={r0,r1,r2,r3} i0={i0,i1,i2,i3}
+#define T4(r0,i0,t0)\
+ asm("movaps "#r0", "#i0" \n"\
+ "shufps $0x64, "#t0", "#r0" \n" /* {r0,i0,r3,i2} */\
+ "shufps $0xce, "#t0", "#i0" \n" /* {r1,i1,r2,i3} */\
+ "movaps "#r0", "#t0" \n"\
+ "addps "#i0", "#r0" \n" /* {t1,t2,t6,t5} */\
+ "subps "#i0", "#t0" \n" /* {t3,t4,t8,t7} */\
+ "movaps "#r0", "#i0" \n"\
+ "shufps $0x44, "#t0", "#r0" \n" /* {t1,t2,t3,t4} */\
+ "shufps $0xbe, "#t0", "#i0" \n" /* {t6,t5,t7,t8} */\
+ "movaps "#r0", "#t0" \n"\
+ "addps "#i0", "#r0" \n" /* {r0,i0,r1,i1} */\
+ "subps "#i0", "#t0" \n" /* {r2,i2,r3,i3} */\
+ "movaps "#r0", "#i0" \n"\
+ "shufps $0x88, "#t0", "#r0" \n" /* {r0,r1,r2,r3} */\
+ "shufps $0xdd, "#t0", "#i0" \n" /* {i0,i1,i2,i3} */\
+ :\
+ );
-static const int p1p1m1m1[4] __attribute__((aligned(16))) =
- { 0, 0, 1 << 31, 1 << 31 };
+#define T8(r0,i0,r1,i1,t0,t1)\
+ asm("movaps "#r1", "#t0" \n"\
+ "shufps $0x44, "#i1", "#r1" \n" /* {r4,i4,r6,i6} */\
+ "shufps $0xee, "#i1", "#t0" \n" /* {r5,i5,r7,i7} */\
+ "movaps "#r1", "#t1" \n"\
+ "subps "#t0", "#r1" \n" /* {r5,i5,r7,i7} */\
+ "addps "#t0", "#t1" \n" /* {t1,t2,t3,t4} */\
+ "movaps "#r1", "#t0" \n"\
+ "shufps $0xb1, "#t0", "#t0" \n" /* {i5,r5,i7,r7} */\
+ "mulps %1, "#r1" \n" /* {-r5,i5,r7,-i7} */\
+ "mulps %0, "#t0" \n"\
+ "addps "#t0", "#r1" \n" /* {t8,t7,ta,t9} */\
+ "movaps "#t1", "#t0" \n"\
+ "shufps $0x36, "#r1", "#t1" \n" /* {t3,t2,t9,t8} */\
+ "shufps $0x9c, "#r1", "#t0" \n" /* {t1,t4,t7,ta} */\
+ "movaps "#t1", "#r1" \n"\
+ "addps "#t0", "#t1" \n" /* {t1,t2,t9,ta} */\
+ "subps "#t0", "#r1" \n" /* {t6,t5,tc,tb} */\
+ "movaps "#t1", "#t0" \n"\
+ "shufps $0xd8, "#r1", "#t1" \n" /* {t1,t9,t5,tb} */\
+ "shufps $0x8d, "#r1", "#t0" \n" /* {t2,ta,t6,tc} */\
+ "movaps "#r0", "#r1" \n"\
+ "movaps "#i0", "#i1" \n"\
+ "addps "#t1", "#r0" \n" /* {r0,r1,r2,r3} */\
+ "addps "#t0", "#i0" \n" /* {i0,i1,i2,i3} */\
+ "subps "#t1", "#r1" \n" /* {r4,r5,r6,r7} */\
+ "subps "#t0", "#i1" \n" /* {i4,i5,i6,i7} */\
+ ::"m"(*root2), "m"(*root2mppm)\
+ );
-static const int p1m1p1m1[4] __attribute__((aligned(16))) =
- { 0, 1 << 31, 0, 1 << 31 };
+#define LOAD(mem,xmm)\
+ asm("movaps %0, "#xmm ::"m"(mem))
-static const int m1m1m1m1[4] __attribute__((aligned(16))) =
- { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
+#define SAVE(mem,xmm)\
+ asm("movaps "#xmm", %0" :"=m"(mem))
-#if 0
-static void print_v4sf(const char *str, __m128 a)
+static void fft4(FFTComplex *z)
{
- float *p = (float *)&a;
- printf("%s: %f %f %f %f\n",
- str, p[0], p[1], p[2], p[3]);
+ LOAD(z[0], %%xmm0);
+ LOAD(z[2], %%xmm2);
+ T4(%%xmm0, %%xmm1, %%xmm2);
+ SAVE(z[0], %%xmm0);
+ SAVE(z[2], %%xmm1);
}
-#endif
-/* XXX: handle reverse case */
-void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+static void fft8(FFTComplex *z)
{
- int ln = s->nbits;
- x86_reg i;
- long j;
- long nblocks, nloops;
- FFTComplex *p, *cptr;
+ LOAD(z[0], %%xmm0);
+ LOAD(z[2], %%xmm2);
+ T4(%%xmm0, %%xmm1, %%xmm2);
+ LOAD(z[4], %%xmm2);
+ LOAD(z[6], %%xmm3);
+ T8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5);
+ SAVE(z[0], %%xmm0);
+ SAVE(z[2], %%xmm1);
+ SAVE(z[4], %%xmm2);
+ SAVE(z[6], %%xmm3);
+}
+static void fft16(FFTComplex *z)
+{
+ LOAD(z[0], %%xmm0);
+ LOAD(z[2], %%xmm2);
+ T4(%%xmm0, %%xmm1, %%xmm2);
+ LOAD(z[4], %%xmm2);
+ LOAD(z[6], %%xmm3);
+ T8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5);
+ LOAD(z[8], %%xmm4);
+ LOAD(z[10], %%xmm6);
+ SAVE(z[0], %%xmm0);
+ SAVE(z[2], %%xmm1);
+ SAVE(z[4], %%xmm2);
+ SAVE(z[6], %%xmm3);
+ T4(%%xmm4, %%xmm5, %%xmm6);
+ LOAD(z[12], %%xmm6);
+ LOAD(z[14], %%xmm0);
+ T4(%%xmm6, %%xmm7, %%xmm0);
+
+ asm("movaps %1, %%xmm0 \n" // wre
+ "movaps %%xmm4, %%xmm2 \n"
+ "movaps 16+%1, %%xmm1 \n" // wim
+ "movaps %%xmm5, %%xmm3 \n"
+ "mulps %%xmm0, %%xmm2 \n" // r2*wre
+ "mulps %%xmm1, %%xmm3 \n" // i2*wim
+ "mulps %%xmm1, %%xmm4 \n" // r2*wim
+ "mulps %%xmm0, %%xmm5 \n" // i2*wre
+ "addps %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm6, %%xmm1 \n" // r3*wim
+ "subps %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+ "movaps %%xmm0, %%xmm4 \n"
+ "mulps %%xmm7, %%xmm3 \n" // i3*wim
+ "mulps %%xmm6, %%xmm4 \n" // r3*wre
+ "mulps %%xmm7, %%xmm0 \n" // i3*wre
+ "subps %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+ "movaps 0x00(%0), %%xmm3 \n"
+ "addps %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+ "movaps %%xmm4, %%xmm1 \n"
+ "addps %%xmm2, %%xmm4 \n" // t5
+ "subps %%xmm2, %%xmm1 \n" // t3
+ "subps %%xmm4, %%xmm3 \n" // r2
+ "addps 0x00(%0), %%xmm4 \n" // r0
+ "movaps 0x20(%0), %%xmm6 \n"
+ "movaps %%xmm3, 0x40(%0) \n"
+ "movaps %%xmm4, 0x00(%0) \n"
+ "movaps %%xmm5, %%xmm3 \n"
+ "subps %%xmm0, %%xmm5 \n" // t4
+ "movaps %%xmm6, %%xmm4 \n"
+ "subps %%xmm5, %%xmm6 \n" // r3
+ "addps %%xmm4, %%xmm5 \n" // r1
+ "movaps %%xmm6, 0x60(%0) \n"
+ "movaps %%xmm5, 0x20(%0) \n"
+ "movaps 0x30(%0), %%xmm2 \n"
+ "addps %%xmm0, %%xmm3 \n" // t6
+ "subps %%xmm1, %%xmm2 \n" // i3
+ "movaps 0x10(%0), %%xmm7 \n"
+ "addps 0x30(%0), %%xmm1 \n" // i1
+ "movaps %%xmm2, 0x70(%0) \n"
+ "movaps %%xmm1, 0x30(%0) \n"
+ "movaps %%xmm7, %%xmm4 \n"
+ "subps %%xmm3, %%xmm7 \n" // i2
+ "addps %%xmm4, %%xmm3 \n" // i0
+ "movaps %%xmm7, 0x50(%0) \n"
+ "movaps %%xmm3, 0x10(%0) \n"
+ ::"r"(z), "m"(*ff_cos_16)
+ :"memory"
+ );
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+static av_noinline void pass(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
asm volatile(
- "movaps %0, %%xmm4 \n\t"
- "movaps %1, %%xmm5 \n\t"
- ::"m"(*p1p1m1m1),
- "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
+ "1: \n"
+ "movaps (%0,%3,4), %%xmm4 \n" // r2
+ "movaps 16(%0,%3,4), %%xmm5 \n" // i2
+ "movaps %%xmm4, %%xmm2 \n"
+ "movaps (%1), %%xmm0 \n" // wre
+ "movaps %%xmm5, %%xmm3 \n"
+ "movaps (%1,%3), %%xmm1 \n" // wim
+ "mulps %%xmm0, %%xmm2 \n" // r2*wre
+ "movaps (%0,%4), %%xmm6 \n" // r3
+ "mulps %%xmm1, %%xmm3 \n" // i2*wim
+ "movaps 16(%0,%4), %%xmm7 \n" // i3
+ "mulps %%xmm1, %%xmm4 \n" // r2*wim
+ "mulps %%xmm0, %%xmm5 \n" // i2*wre
+ "addps %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm6, %%xmm1 \n" // r3*wim
+ "subps %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+ "movaps %%xmm0, %%xmm4 \n"
+ "mulps %%xmm7, %%xmm3 \n" // i3*wim
+ "mulps %%xmm6, %%xmm4 \n" // r3*wre
+ "mulps %%xmm7, %%xmm0 \n" // i3*wre
+ "subps %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+ "movaps (%0), %%xmm3 \n"
+ "addps %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+ "movaps %%xmm4, %%xmm1 \n"
+ "addps %%xmm2, %%xmm4 \n" // t5
+ "subps %%xmm2, %%xmm1 \n" // t3
+ "subps %%xmm4, %%xmm3 \n" // r2
+ "addps (%0), %%xmm4 \n" // r0
+ "movaps (%0,%3,2), %%xmm6 \n"
+ "movaps %%xmm3, (%0,%3,4) \n"
+ "movaps %%xmm4, (%0) \n"
+ "movaps %%xmm5, %%xmm3 \n"
+ "subps %%xmm0, %%xmm5 \n" // t4
+ "movaps %%xmm6, %%xmm4 \n"
+ "subps %%xmm5, %%xmm6 \n" // r3
+ "addps %%xmm4, %%xmm5 \n" // r1
+ "movaps %%xmm6, (%0,%4) \n"
+ "movaps %%xmm5, (%0,%3,2) \n"
+ "movaps 16(%0,%3,2), %%xmm2 \n"
+ "addps %%xmm0, %%xmm3 \n" // t6
+ "subps %%xmm1, %%xmm2 \n" // i3
+ "movaps 16(%0), %%xmm7 \n"
+ "addps 16(%0,%3,2), %%xmm1 \n" // i1
+ "movaps %%xmm2, 16(%0,%4) \n"
+ "movaps %%xmm1, 16(%0,%3,2) \n"
+ "movaps %%xmm7, %%xmm4 \n"
+ "subps %%xmm3, %%xmm7 \n" // i2
+ "addps %%xmm4, %%xmm3 \n" // i0
+ "movaps %%xmm7, 16(%0,%3,4) \n"
+ "movaps %%xmm3, 16(%0) \n"
+ "add $32, %0 \n"
+ "add $16, %1 \n"
+ "sub $2, %2 \n"
+ "jg 1b \n"
+ :"+r"(z), "+r"(w), "+r"(n)
+ :"r"(n*8), "r"(n*48)
+ :"memory"
);
+}
- i = 8 << ln;
+static av_noinline void pass_interleave(FFTComplex *z, const FFTSample *w, uintptr_t n)
+{
asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- /* do the pass 0 butterfly */
- "movaps (%0,%1), %%xmm0 \n\t"
- "movaps %%xmm0, %%xmm1 \n\t"
- "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
- "xorps %%xmm4, %%xmm1 \n\t"
- "addps %%xmm1, %%xmm0 \n\t"
- "movaps 16(%0,%1), %%xmm2 \n\t"
- "movaps %%xmm2, %%xmm3 \n\t"
- "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm4, %%xmm3 \n\t"
- "addps %%xmm3, %%xmm2 \n\t"
- /* multiply third by -i */
- /* by toggling the sign bit */
- "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm5, %%xmm2 \n\t"
- /* do the pass 1 butterfly */
- "movaps %%xmm0, %%xmm1 \n\t"
- "addps %%xmm2, %%xmm0 \n\t"
- "subps %%xmm2, %%xmm1 \n\t"
- "movaps %%xmm0, (%0,%1) \n\t"
- "movaps %%xmm1, 16(%0,%1) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(z)
+ "1: \n"
+ "movaps (%0,%3,4), %%xmm4 \n" // r2
+ "movaps 16(%0,%3,4), %%xmm5 \n" // i2
+ "movaps %%xmm4, %%xmm2 \n"
+ "movaps (%1), %%xmm0 \n" // wre
+ "movaps %%xmm5, %%xmm3 \n"
+ "movaps (%1,%3), %%xmm1 \n" // wim
+ "mulps %%xmm0, %%xmm2 \n" // r2*wre
+ "movaps (%0,%4), %%xmm6 \n" // r3
+ "mulps %%xmm1, %%xmm3 \n" // i2*wim
+ "movaps 16(%0,%4), %%xmm7 \n" // i3
+ "mulps %%xmm1, %%xmm4 \n" // r2*wim
+ "mulps %%xmm0, %%xmm5 \n" // i2*wre
+ "addps %%xmm3, %%xmm2 \n" // r2*wre + i2*wim
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm6, %%xmm1 \n" // r3*wim
+ "subps %%xmm4, %%xmm5 \n" // i2*wre - r2*wim
+ "movaps %%xmm0, %%xmm4 \n"
+ "mulps %%xmm7, %%xmm3 \n" // i3*wim
+ "mulps %%xmm6, %%xmm4 \n" // r3*wre
+ "mulps %%xmm7, %%xmm0 \n" // i3*wre
+ "subps %%xmm3, %%xmm4 \n" // r3*wre - i3*wim
+ "movaps (%0), %%xmm3 \n"
+ "addps %%xmm1, %%xmm0 \n" // i3*wre + r3*wim
+ "movaps %%xmm4, %%xmm1 \n"
+ "addps %%xmm2, %%xmm4 \n" // t5
+ "subps %%xmm2, %%xmm1 \n" // t3
+ "subps %%xmm4, %%xmm3 \n" // r2
+ "addps (%0), %%xmm4 \n" // r0
+ "movaps (%0,%3,2), %%xmm6 \n"
+ "movaps %%xmm3, (%0,%3,4) \n"
+ "movaps %%xmm4, (%0) \n"
+ "movaps %%xmm5, %%xmm3 \n"
+ "subps %%xmm0, %%xmm5 \n" // t4
+ "movaps %%xmm6, %%xmm4 \n"
+ "subps %%xmm5, %%xmm6 \n" // r3
+ "addps %%xmm4, %%xmm5 \n" // r1
+ "movaps 16(%0,%3,2), %%xmm2 \n"
+ "addps %%xmm0, %%xmm3 \n" // t6
+ "subps %%xmm1, %%xmm2 \n" // i3
+ "movaps 16(%0), %%xmm7 \n"
+ "addps 16(%0,%3,2), %%xmm1 \n" // i1
+ "movaps %%xmm7, %%xmm4 \n"
+ "subps %%xmm3, %%xmm7 \n" // i2
+ "addps %%xmm4, %%xmm3 \n" // i0
+ "movaps %%xmm5, %%xmm4 \n" // r1
+ "movaps %%xmm6, %%xmm0 \n" // r3
+ "unpcklps %%xmm1, %%xmm5 \n"
+ "unpckhps %%xmm1, %%xmm4 \n"
+ "unpcklps %%xmm2, %%xmm6 \n"
+ "unpckhps %%xmm2, %%xmm0 \n"
+ "movaps (%0), %%xmm1 \n"
+ "movaps (%0,%3,4), %%xmm2 \n"
+ "movaps %%xmm5, (%0,%3,2) \n"
+ "movaps %%xmm4, 16(%0,%3,2) \n"
+ "movaps %%xmm6, (%0,%4) \n"
+ "movaps %%xmm0, 16(%0,%4) \n"
+ "movaps %%xmm1, %%xmm5 \n" // r0
+ "movaps %%xmm2, %%xmm4 \n" // r2
+ "unpcklps %%xmm3, %%xmm1 \n"
+ "unpckhps %%xmm3, %%xmm5 \n"
+ "unpcklps %%xmm7, %%xmm2 \n"
+ "unpckhps %%xmm7, %%xmm4 \n"
+ "movaps %%xmm1, (%0) \n"
+ "movaps %%xmm5, 16(%0) \n"
+ "movaps %%xmm2, (%0,%3,4) \n"
+ "movaps %%xmm4, 16(%0,%3,4) \n"
+ "add $32, %0 \n"
+ "add $16, %1 \n"
+ "sub $2, %2 \n"
+ "jg 1b \n"
+ :"+r"(z), "+r"(w), "+r"(n)
+ :"r"(n*8), "r"(n*48)
+ :"memory"
);
- /* pass 2 .. ln-1 */
-
- nblocks = 1 << (ln-3);
- nloops = 1 << 2;
- cptr = s->exptab1;
- do {
- p = z;
- j = nblocks;
- do {
- i = nloops*8;
- asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- "movaps (%2,%0), %%xmm1 \n\t"
- "movaps (%1,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm5 \n\t"
- "movaps 16(%1,%0), %%xmm4 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "movaps %%xmm5, %%xmm6 \n\t"
- "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
- "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
- "shufps $0xA0, %%xmm5, %%xmm5 \n\t"
- "shufps $0xF5, %%xmm6, %%xmm6 \n\t"
- "mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
- "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
- "mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re
- "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
- "addps %%xmm2, %%xmm1 \n\t"
- "addps %%xmm6, %%xmm5 \n\t"
- "movaps %%xmm0, %%xmm3 \n\t"
- "movaps %%xmm4, %%xmm7 \n\t"
- "addps %%xmm1, %%xmm0 \n\t"
- "subps %%xmm1, %%xmm3 \n\t"
- "addps %%xmm5, %%xmm4 \n\t"
- "subps %%xmm5, %%xmm7 \n\t"
- "movaps %%xmm0, (%1,%0) \n\t"
- "movaps %%xmm3, (%2,%0) \n\t"
- "movaps %%xmm4, 16(%1,%0) \n\t"
- "movaps %%xmm7, 16(%2,%0) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(p), "r"(p + nloops), "r"(cptr)
- );
- p += nloops*2;
- } while (--j);
- cptr += nloops*2;
- nblocks >>= 1;
- nloops <<= 1;
- } while (nblocks != 0);
+}
+
+#include "../fft_template.h"
+
+DECL_FFT(32,16,8,_interleave)
+DECL_FFT(64,32,16,_interleave)
+DECL_FFT(128,64,32,_interleave)
+DECL_FFT(256,128,64,_interleave)
+DECL_FFT(512,256,128,_interleave)
+DECL_FFT(1024,512,256,_interleave)
+DECL_FFT(2048,1024,512,_interleave)
+DECL_FFT(4096,2048,1024,_interleave)
+DECL_FFT(8192,4096,2048,_interleave)
+DECL_FFT(16384,8192,4096,_interleave)
+DECL_FFT(32768,16384,8192,_interleave)
+DECL_FFT(65536,32768,16384,_interleave)
+
+static void (*fft_dispatch_interleave[])(FFTComplex*) = {
+ fft4, fft8, fft16,
+ fft32_interleave, fft64_interleave, fft128_interleave, fft256_interleave,
+ fft512_interleave, fft1024_interleave, fft2048_interleave, fft4096_interleave,
+ fft8192_interleave, fft16384_interleave, fft32768_interleave, fft65536_interleave,
+};
+
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+
+ fft_dispatch_interleave[s->nbits-2](z);
+
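+    // the size<=16 kernels leave their output in planar blocks (four re's
+    // then four im's); reinterleave them into {re,im} pairs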
+ if(n <= 16) {
+ x86_reg i = -8*n;
+ asm volatile(
+ "1: \n"
+ "movaps (%0,%1), %%xmm0 \n"
+ "movaps %%xmm0, %%xmm1 \n"
+ "unpcklps 16(%0,%1), %%xmm0 \n"
+ "unpckhps 16(%0,%1), %%xmm1 \n"
+ "movaps %%xmm0, (%0,%1) \n"
+ "movaps %%xmm1, 16(%0,%1) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(z+n)
+ :"memory"
+ );
+ }
+}
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+ int i;
+ for(i=0; i<n; i+=2) {
+ asm volatile(
+ "movaps %2, %%xmm0 \n"
+ "movlps %%xmm0, %0 \n"
+ "movhps %%xmm0, %1 \n"
+ :"=m"(s->tmp_buf[s->revtab[i]]),
+ "=m"(s->tmp_buf[s->revtab[i+1]])
+ :"m"(z[i])
+ );
+ }
+ memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
--
1.5.5.1