[Mplayer-cvslog] CVS: main/libac3/mmx imdct_3dnow.c,NONE,1.1 srfft_3dnow.c,NONE,1.1

Wed May 23 10:20:18 CEST 2001

Update of /cvsroot/mplayer/main/libac3/mmx
In directory usw-pr-cvs1:/tmp/cvs-serv11145/main/libac3/mmx

Added Files:
	imdct_3dnow.c srfft_3dnow.c 
Log Message:
initial 3dnow support

--- NEW FILE ---
/* 
 *  imdct.c
 *
 *	Copyright (C) Aaron Holtzman - May 1999
 *
 *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
 *	
 *  ac3dec is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  ac3dec is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 *  Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations 
 *  by Nick Kurshev <nickols_k at mail.ru>
 */

/**
 *
 **/

void imdct_do_512(float data[],float delay[])
{
	int i, j;
	float tmp_a_r, tmp_a_i;
	float *data_ptr;
	float *delay_ptr;
	float *window_ptr;

// 512 IMDCT with source and dest data in 'data'
// Pre IFFT complex multiply plus IFFT complex conjugate

	for( i=0; i < 128; i++) {
		j = pm128[i];
		//a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
		//c = data[2*j] * xcos1[j];
		//b = data[256-2*j-1] * xsin1[j];
		//buf1[i].re = a - b + c;
		//buf1[i].im = b + c;
		buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
		buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
	}

	fft_128p (&buf[0]);

// Post IFFT complex multiply  plus IFFT complex conjugate
	asm volatile ("femms":::"memory");
#ifndef HAVE_3DNOWEX
	asm volatile ("movl $1, %%eax\n\t"
		      "movd %%eax, %%mm4\n\t"
		      "negl %%eax\n\t"
		      "movd %%eax, %%mm5\n\t"
		      "punpckldq %%mm5, %%mm4\n\t" /* 1.0 | -1.0 */
		      "pi2fd %%mm4, %%mm4\n\t":::"eax","memory");
#endif
	for (i=0; i < 128; i++) {
	    asm volatile("movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
		         "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
#ifndef HAVE_3DNOWEX
			 "movq %%mm1, %%mm2\n\t"
			 "psrlq $32, %%mm1\n\t"
			 "punpckldq %%mm2, %%mm1\n\t"
#else			 
		         "pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */
#endif			 
			 "movd %3, %%mm3\n\t" /* ac3_xsin[i] */
			 "punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */
			 "pfmul %%mm3, %%mm0\n\t"
			 "pfmul %%mm3, %%mm1\n\t"
#ifndef HAVE_3DNOWEX
			 "pfmul %%mm4, %%mm0\n\t"
			 "pfacc %%mm1, %%mm0\n\t"
			 "movd %%mm0, 4%0\n\t"
			 "psrlq $32, %%mm0\n\t"
			 "movd %%mm0, %0\n\t"
#else
			 "pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */
			 "pswapd %%mm0, %%mm0\n\t"
			 "movq %%mm0, %0"
#endif
			 :"=m"(buf[i])
			 :"0"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i])
			 :"memory");
			 /*
		ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i])  +  (tmp_a_i  * ac3_xsin1[i]);
		ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i])  -  (tmp_a_i  * ac3_xcos1[i]);
		*/
	}
	asm volatile ("femms":::"memory");

	data_ptr = data;
	delay_ptr = delay;
	window_ptr = window;

// Window and convert to real valued signal
	for (i=0; i< 64; i++) {
		*data_ptr++   = -buf[64+i].im   * *window_ptr++ + *delay_ptr++;
		*data_ptr++   = buf[64-i-1].re * *window_ptr++ + *delay_ptr++;
	}

	for(i=0; i< 64; i++) {
		*data_ptr++  = -buf[i].re       * *window_ptr++ + *delay_ptr++;
		*data_ptr++  = buf[128-i-1].im * *window_ptr++ + *delay_ptr++;
	}

// The trailing edge of the window goes into the delay line
	delay_ptr = delay;

	for(i=0; i< 64; i++) {
		*delay_ptr++  = -buf[64+i].re   * *--window_ptr;
		*delay_ptr++  =  buf[64-i-1].im * *--window_ptr;
	}

	for(i=0; i<64; i++) {
		*delay_ptr++  =  buf[i].im       * *--window_ptr;
		*delay_ptr++  = -buf[128-i-1].re * *--window_ptr;
	}
}


/**
 *
 **/

void imdct_do_512_nol (float data[], float delay[])
{
       int i, j;
	float tmp_a_r, tmp_a_i;
	float *data_ptr;
	float *delay_ptr;
	float *window_ptr;

        //
        // 512 IMDCT with source and dest data in 'data'
        //

       // Pre IFFT complex multiply plus IFFT cmplx conjugate
        for( i=0; i < 128; i++)
        {
         /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) */
         j = pm128[i];
         //a = (data[256-2*j-1] - data[2*j]) * (xcos1[j] + xsin1[j]);
         //c = data[2*j] * xcos1[j];
         //b = data[256-2*j-1] * xsin1[j];
          //buf1[i].re = a - b + c;
         //buf1[i].im = b + c;
         buf[i].re = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
         buf[i].im = -1.0 * (data[2*j] * xcos1[j] + data[256-2*j-1] * xsin1[j]);
        }

       fft_128p(&buf[0]);

        /* Post IFFT complex multiply  plus IFFT complex conjugate*/
        for( i=0; i < 128; i++)
        {
                /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
               /* int j1 = i; */
               tmp_a_r = buf[i].re;
               tmp_a_i = buf[i].im;
               //a = (tmp_a_r - tmp_a_i) * (xcos1[j] + xsin1[j]);
               //b = tmp_a_r * xsin1[j];
               //c = tmp_a_i * xcos1[j];
               //buf[j].re = a - b + c;
               //buf[j].im = b + c;
               buf[i].re =(tmp_a_r * xcos1[i])  +  (tmp_a_i  * xsin1[i]);
               buf[i].im =(tmp_a_r * xsin1[i])  -  (tmp_a_i  * xcos1[i]);
        }

        data_ptr = data;
        delay_ptr = delay;
        window_ptr = window;

       /* Window and convert to real valued signal, no overlap here*/
        for(i=0; i< 64; i++) {
               *data_ptr++   = -buf[64+i].im   * *window_ptr++;
               *data_ptr++   = buf[64-i-1].re * *window_ptr++;
        }

        /* The trailing edge of the window goes into the delay line */

   delay_ptr = delay;

        for(i=0; i< 64; i++) {
               *delay_ptr++  = -buf[64+i].re   * *--window_ptr;
               *delay_ptr++  =  buf[64-i-1].im * *--window_ptr;
        }

        for(i=0; i<64; i++) {
               *delay_ptr++  =  buf[i].im       * *--window_ptr;
               *delay_ptr++  = -buf[128-i-1].re * *--window_ptr;
        }
}


/**
 *
 **/

void imdct_do_256 (float data[],float delay[])
{
	int i, j, k;
	int p, q;

	float tmp_a_i;
	float tmp_a_r;

	float *data_ptr;
	float *delay_ptr;
	float *window_ptr;

	complex_t *buf1, *buf2;

	buf1 = &buf[0];
	buf2 = &buf[64];

// Pre IFFT complex multiply plus IFFT complex conjugate
	for (k=0; k<64; k++) { 
		/* X1[k] = X[2*k]	*/
		/* X2[k] = X[2*k+1]	*/

		j = pm64[k];
		p = 2 * (128-2*j-1);
		q = 2 * (2 * j);

		/* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
		buf1[k].re =         data[p] * xcos2[j] - data[q] * xsin2[j];
		buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
		/* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
		buf2[k].re =          data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
		buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
	}

	fft_64p(&buf1[0]);
	fft_64p(&buf2[0]);

#ifdef DEBUG
       //DEBUG FFT
#if 0
	printf ("Post FFT, buf1\n");
	for (i=0; i < 64; i++)
		printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
	printf ("Post FFT, buf2\n");
	for (i=0; i < 64; i++)
		printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
#endif
#endif


	// Post IFFT complex multiply 
	for( i=0; i < 64; i++) {
		tmp_a_r =  buf1[i].re;
		tmp_a_i = -buf1[i].im;
		buf1[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
		buf1[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
		tmp_a_r =  buf2[i].re;
		tmp_a_i = -buf2[i].im;
		buf2[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
		buf2[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
	}
	
	data_ptr = data;
	delay_ptr = delay;
	window_ptr = window;

	/* Window and convert to real valued signal */
	for(i=0; i< 64; i++) { 
		*data_ptr++  = -buf1[i].im      * *window_ptr++ + *delay_ptr++;
		*data_ptr++  = buf1[64-i-1].re * *window_ptr++ + *delay_ptr++;
	}

	for(i=0; i< 64; i++) {
		*data_ptr++  = -buf1[i].re      * *window_ptr++ + *delay_ptr++;
		*data_ptr++  = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
	}
	
	delay_ptr = delay;

	for(i=0; i< 64; i++) {
		*delay_ptr++ = -buf2[i].re      * *--window_ptr;
		*delay_ptr++ =  buf2[64-i-1].im * *--window_ptr;
	}

	for(i=0; i< 64; i++) {
		*delay_ptr++ =  buf2[i].im      * *--window_ptr;
		*delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
	}
}


/**
 *
 **/

void imdct_do_256_nol (float data[], float delay[])
{
	int i, j, k;
	int p, q;

	float tmp_a_i;
	float tmp_a_r;

	float *data_ptr;
	float *delay_ptr;
	float *window_ptr;

	complex_t *buf1, *buf2;

	buf1 = &buf[0];
	buf2 = &buf[64];

       /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
	for(k=0; k<64; k++) {
               /* X1[k] = X[2*k]  */
               /* X2[k] = X[2*k+1]     */
               j = pm64[k];
               p = 2 * (128-2*j-1);
               q = 2 * (2 * j);

               /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
               buf1[k].re =         data[p] * xcos2[j] - data[q] * xsin2[j];
               buf1[k].im = -1.0f * (data[q] * xcos2[j] + data[p] * xsin2[j]);
               /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
               buf2[k].re =          data[p + 1] * xcos2[j] - data[q + 1] * xsin2[j];
               buf2[k].im = -1.0f * ( data[q + 1] * xcos2[j] + data[p + 1] * xsin2[j]);
       }


       fft_64p(&buf1[0]);
       fft_64p(&buf2[0]);

#ifdef DEBUG
	//DEBUG FFT
#if 0
       printf("Post FFT, buf1\n");
       for (i=0; i < 64; i++)
               printf("%d %f %f\n", i, buf_1[i].re, buf_1[i].im);
       printf("Post FFT, buf2\n");
       for (i=0; i < 64; i++)
               printf("%d %f %f\n", i, buf_2[i].re, buf_2[i].im);
#endif
#endif

       /* Post IFFT complex multiply */
       for( i=0; i < 64; i++) {
               /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
               tmp_a_r =  buf1[i].re;
               tmp_a_i = -buf1[i].im;
               buf1[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
               buf1[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
               /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
               tmp_a_r =  buf2[i].re;
               tmp_a_i = -buf2[i].im;
               buf2[i].re =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
               buf2[i].im =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
       }
      
       data_ptr = data;
       delay_ptr = delay;
       window_ptr = window;

       /* Window and convert to real valued signal, no overlap */
       for(i=0; i< 64; i++) {
               *data_ptr++  = -buf1[i].im      * *window_ptr++;
               *data_ptr++  = buf1[64-i-1].re * *window_ptr++;
       }

       for(i=0; i< 64; i++) {
               *data_ptr++  = -buf1[i].re      * *window_ptr++ + *delay_ptr++;
               *data_ptr++  = buf1[64-i-1].im * *window_ptr++ + *delay_ptr++;
       }

       delay_ptr = delay;

       for(i=0; i< 64; i++) {
               *delay_ptr++ = -buf2[i].re      * *--window_ptr;
               *delay_ptr++ =  buf2[64-i-1].im * *--window_ptr;
       }

       for(i=0; i< 64; i++) {
               *delay_ptr++ =  buf2[i].im      * *--window_ptr;
               *delay_ptr++ = -buf2[64-i-1].re * *--window_ptr;
	}
}

--- NEW FILE ---
/* 
 *  srfft.c
 *
 *  Copyright (C) Yuqing Deng <Yuqing_Deng at brown.edu> - April 2000
 *
 *  64 and 128 point split radix fft for ac3dec
 *
 *  The algorithm is desribed in the book:
 *  "Computational Frameworks of the Fast Fourier Transform".
 *
 *  The ideas and the the organization of code borrowed from djbfft written by
 *  D. J. Bernstein <djb at cr.py.to>.  djbff can be found at 
 *  http://cr.yp.to/djbfft.html.
 *
 *  srfft.c is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  srfft.c is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
/*
 * Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations support
 * by Nick Kurshev <nickols_k at mail.ru>
 */

void fft_4(complex_t *x)
{
  /* delta_p = 1 here */
  /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} 
   */

  register complex_t yt, yb, u, vi;
  
  yt.re = x[0].re;
  yb.re = yt.re - x[2].re;
  yt.re += x[2].re;

  u.re = x[1].re;
  vi.im = x[3].re - u.re;
  u.re += x[3].re;
  
  u.im = x[1].im;
  vi.re = u.im - x[3].im;
  u.im += x[3].im;

  yt.im = yt.re;
  yt.im += u.re;
  x[0].re = yt.im;
  yt.re -= u.re;
  x[2].re = yt.re;
  yt.im = yb.re;
  yt.im += vi.re;
  x[1].re = yt.im;
  yb.re -= vi.re;
  x[3].re = yb.re;

  yt.im = x[0].im;
  yb.im = yt.im - x[2].im;
  yt.im += x[2].im;

  yt.re = yt.im;
  yt.re += u.im;
  x[0].im = yt.re;
  yt.im -= u.im;
  x[2].im = yt.im;
  yt.re = yb.im;
  yt.re += vi.im;
  x[1].im = yt.re;
  yb.im -= vi.im;
  x[3].im = yb.im;
}
#if 0
/* is never called */
void fftu_4(complex_t *x)
{
  /* delta_p = 1 here */
  /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{2*pi/4} 
   */

  register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
  
  yt_r = x[0].re;
  yb_r = yt_r - x[2].re;
  yt_r += x[2].re;

  u_r = x[1].re;
  vi_i = x[3].re - u_r;
  u_r += x[3].re;
  
  u_i = x[1].im;
  vi_r = u_i - x[3].im;
  u_i += x[3].im;

  yt_i = yt_r;
  yt_i += u_r;
  x[0].re = yt_i;
  yt_r -= u_r;
  x[2].re = yt_r;
  yt_i = yb_r;
  yt_i += vi_r;
  x[3].re = yt_i;
  yb_r -= vi_r;
  x[1].re = yb_r;

  yt_i = x[0].im;
  yb_i = yt_i - x[2].im;
  yt_i += x[2].im;

  yt_r = yt_i;
  yt_r += u_i;
  x[0].im = yt_r;
  yt_i -= u_i;
  x[2].im = yt_i;
  yt_r = yb_i;
  yt_r += vi_i;
  x[3].im = yt_r;
  yb_i -= vi_i;
  x[1].im = yb_i;
}
#endif
void fft_8(complex_t *x)
{
  /* delta_p = diag{1, sqrt(i)} here */
  /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} 
   */
  complex_t wT1, wB1, wT2, wB2;
  
  asm(
	"movq	8%0, %%mm0\n\t"
	"movq	24%0, %%mm1\n\t"
	"movq	%%mm0, %1\n\t"  /* wT1 = x[1]; */
	"movq	%%mm1, %2\n\t" /* wB1 = x[3]; */
	:"=m"(*x),"=m"(wT1), "=m"(wB1)
	:"0"(*x));

  asm(
	"movq	16%0, %%mm0\n\t"
	"movq	32%0, %%mm1\n\t"
	"movq	48%0, %%mm2\n\t"
	"movq	%%mm0, 8%0\n\t"  /* x[1] = x[2]; */
	"movq	%%mm1, 16%0\n\t" /* x[2] = x[4]; */
	"movq	%%mm2, 24%0\n\t" /* x[3] = x[6]; */
	:"=m"(*x)
	:"0"(*x));
  asm volatile("femms":::"memory");
  fft_4(&x[0]);

  
  /* x[0] x[4] */
  wT2.re = x[5].re;
  wT2.re += x[7].re;
  wT2.re += wT1.re;
  wT2.re += wB1.re;
  wT2.im = wT2.re;
  wT2.re += x[0].re;
  wT2.im = x[0].re - wT2.im;
  x[0].re = wT2.re;
  x[4].re = wT2.im;

  wT2.im = x[5].im;
  wT2.im += x[7].im;
  wT2.im += wT1.im;
  wT2.im += wB1.im;
  wT2.re = wT2.im;
  wT2.re += x[0].im;
  wT2.im = x[0].im - wT2.im;
  x[0].im = wT2.re;
  x[4].im = wT2.im;
  
  /* x[2] x[6] */
  wT2.re = x[5].im;
  wT2.re -= x[7].im;
  wT2.re += wT1.im;
  wT2.re -= wB1.im;
  wT2.im = wT2.re;
  wT2.re += x[2].re;
  wT2.im = x[2].re - wT2.im;
  x[2].re = wT2.re;
  x[6].re = wT2.im;

  wT2.im = x[5].re;
  wT2.im -= x[7].re;
  wT2.im += wT1.re;
  wT2.im -= wB1.re;
  wT2.re = wT2.im;
  wT2.re += x[2].im;
  wT2.im = x[2].im - wT2.im;
  x[2].im = wT2.im;
  x[6].im = wT2.re;
  

  /* x[1] x[5] */
  wT2.re = wT1.re;
  wT2.re += wB1.im;
  wT2.re -= x[5].re;
  wT2.re -= x[7].im;
  wT2.im = wT1.im;
  wT2.im -= wB1.re;
  wT2.im -= x[5].im;
  wT2.im += x[7].re;

  wB2.re = wT2.re;
  wB2.re += wT2.im;
  wT2.im -= wT2.re;
  wB2.re *= HSQRT2;
  wT2.im *= HSQRT2;
  wT2.re = wB2.re;
  wB2.re += x[1].re;
  wT2.re =  x[1].re - wT2.re;

  wB2.im = x[5].re;
  x[1].re = wB2.re;
  x[5].re = wT2.re;

  wT2.re = wT2.im;
  wT2.re += x[1].im;
  wT2.im = x[1].im - wT2.im;
  wB2.re = x[5].im;
  x[1].im = wT2.re;
  x[5].im = wT2.im;

  /* x[3] x[7] */
  wT1.re -= wB1.im;
  wT1.im += wB1.re;
  wB1.re = wB2.im - x[7].im;
  wB1.im = wB2.re + x[7].re;
  wT1.re -= wB1.re;
  wT1.im -= wB1.im;
  wB1.re = wT1.re + wT1.im;
  wB1.re *= HSQRT2;
  wT1.im -= wT1.re;
  wT1.im *= HSQRT2;
  wB2.re = x[3].re;
  wB2.im = wB2.re + wT1.im;
  wB2.re -= wT1.im;
  x[3].re = wB2.im;
  x[7].re = wB2.re;
  wB2.im = x[3].im;
  wB2.re = wB2.im + wB1.re;
  wB2.im -= wB1.re;
  x[3].im = wB2.im;
  x[7].im = wB2.re;
}
#if 0
/* is never called */
void fftu_8(complex_t *x)
{
  /* delta_p = diag{1, sqrt(i)} here */
  /* this function computes x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{2*pi/8} 
   */
  register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
  
  /* 2 F_2 on x(1:4:7) and x(3:4:7) */
  /* wB2 is weighted by i */

  wT1_r = x[1].re;
  wT1_i = x[1].im;
  wB1_r = x[3].re;
  wB1_i = x[3].im;

  /* 1 F_4 on x(0:2:7) */
  x[1] = x[2];
  x[2] = x[4];
  x[3] = x[6];
  fft_4(&x[0]);

  
  /* x[0] x[4] */
  wT2_r = x[5].re;
  wT2_r += x[7].re;
  wT2_r += wT1_r;
  wT2_r += wB1_r;
  wT2_i = wT2_r;
  wT2_r += x[0].re;
  wT2_i = x[0].re - wT2_i;
  x[0].re = wT2_r;
  x[4].re = wT2_i;

  wT2_i = x[5].im;
  wT2_i += x[7].im;
  wT2_i += wT1_i;
  wT2_i += wB1_i;
  wT2_r = wT2_i;
  wT2_r += x[0].im;
  wT2_i = x[0].im - wT2_i;
  x[0].im = wT2_r;
  x[4].im = wT2_i;
  
  /* x[2] x[6] */
  wT2_r = x[5].im;
  wT2_r -= x[7].im;
  wT2_r += wT1_i;
  wT2_r -= wB1_i;
  wT2_i = wT2_r;
  wT2_r += x[2].re;
  wT2_i = x[2].re - wT2_i;
  x[2].re = wT2_i;
  x[6].re = wT2_r;

  wT2_i = x[5].re;
  wT2_i -= x[7].re;
  wT2_i += wT1_r;
  wT2_i -= wB1_r;
  wT2_r = wT2_i;
  wT2_r += x[2].im;
  wT2_i = x[2].im - wT2_i;
  x[2].im = wT2_r;
  x[6].im = wT2_i;
  

  /* x[1] x[5] */
  wT2_r = wT1_r;
  wT2_r -= wB1_i;
  wT2_r -= x[5].re;
  wT2_r += x[7].im;
  wT2_i = wT1_i;
  wT2_i += wB1_r;
  wT2_i -= x[5].im;
  wT2_i -= x[7].re;

  wB2_r = wT2_r;
  wB2_r -= wT2_i;
  wT2_i += wT2_r;
  wB2_r *= HSQRT2;
  wT2_i *= HSQRT2;
  wT2_r = wB2_r;
  wB2_r += x[1].re;
  wT2_r =  x[1].re - wT2_r;

  wB2_i = x[5].re;
  x[1].re = wB2_r;
  x[5].re = wT2_r;

  wT2_r = wT2_i;
  wT2_r += x[1].im;
  wT2_i = x[1].im - wT2_i;
  wB2_r = x[5].im;
  x[1].im = wT2_r;
  x[5].im = wT2_i;

  /* x[3] x[7] */
  wT1_r += wB1_i;
  wT1_i -= wB1_r;
  wB1_r = wB2_i + x[7].im;
  wB1_i = wB2_r - x[7].re;
  wT1_r -= wB1_r;
  wT1_i -= wB1_i;
  wB1_r = wT1_r - wT1_i;
  wB1_r *= HSQRT2;
  wT1_i += wT1_r;
  wT1_i *= HSQRT2;
  wB2_r = x[3].re;
  wB2_i = wB2_r + wT1_i;
  wB2_r -= wT1_i;
  x[3].re = wB2_r;
  x[7].re = wB2_i;
  wB2_i = x[3].im;
  wB2_r = wB2_i + wB1_r;
  wB2_i -= wB1_r;
  x[3].im = wB2_r;
  x[7].im = wB2_i;
}
#endif
void fft_asmb(int k, complex_t *x, complex_t *wTB,
	     const complex_t *d, const complex_t *d_3)
{
  register complex_t  *x2k, *x3k, *x4k, *wB;
  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;

  x2k = x + 2 * k;
  x3k = x2k + 2 * k;
  x4k = x3k + 2 * k;
  wB = wTB + 2 * k;
  
  TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
  TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
  
  --k;
  for(;;) {
     TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
     TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
     if (!--k) break;
     x += 2;
     x2k += 2;
     x3k += 2;
     x4k += 2;
     d += 2;
     d_3 += 2;
     wTB += 2;
     wB += 2;
  }
 
}

void fft_asmb16(complex_t *x, complex_t *wTB)
{
  register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
  int k = 2;

  /* transform x[0], x[8], x[4], x[12] */
  TRANSZERO(x[0],x[4],x[8],x[12]);

  /* transform x[1], x[9], x[5], x[13] */
  TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);

  /* transform x[2], x[10], x[6], x[14] */
  TRANSHALF_16(x[2],x[6],x[10],x[14]);

  /* transform x[3], x[11], x[7], x[15] */
  TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);

} 


_______________________________________________
Mplayer-cvslog mailing list
Mplayer-cvslog at lists.sourceforge.net
http://lists.sourceforge.net/lists/listinfo/mplayer-cvslog