[Ffmpeg-cvslog] r5954 - in trunk/libavcodec: dsputil.h fft.c i386/fft_3dn2.c vorbis.c wmadec.c
lorenm
subversion
Tue Aug 8 06:01:06 CEST 2006
Author: lorenm
Date: Tue Aug 8 06:01:04 2006
New Revision: 5954
Modified:
trunk/libavcodec/dsputil.h
trunk/libavcodec/fft.c
trunk/libavcodec/i386/fft_3dn2.c
trunk/libavcodec/vorbis.c
trunk/libavcodec/wmadec.c
Log:
3dnow2 implementation of imdct.
6% faster vorbis and wma.
Modified: trunk/libavcodec/dsputil.h
==============================================================================
--- trunk/libavcodec/dsputil.h (original)
+++ trunk/libavcodec/dsputil.h Tue Aug 8 06:01:04 2006
@@ -594,6 +594,8 @@
FFTSample type */
typedef float FFTSample;
+struct MDCTContext;
+
typedef struct FFTComplex {
FFTSample re, im;
} FFTComplex;
@@ -605,6 +607,8 @@
FFTComplex *exptab;
FFTComplex *exptab1; /* only used by SSE code */
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+ void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
+ const FFTSample *input, FFTSample *tmp);
} FFTContext;
int ff_fft_init(FFTContext *s, int nbits, int inverse);
@@ -635,6 +639,8 @@
int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
void ff_imdct_calc(MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+ const FFTSample *input, FFTSample *tmp);
void ff_mdct_calc(MDCTContext *s, FFTSample *out,
const FFTSample *input, FFTSample *tmp);
void ff_mdct_end(MDCTContext *s);
Modified: trunk/libavcodec/fft.c
==============================================================================
--- trunk/libavcodec/fft.c (original)
+++ trunk/libavcodec/fft.c Tue Aug 8 06:01:04 2006
@@ -54,6 +54,7 @@
s->exptab[i].im = s1;
}
s->fft_calc = ff_fft_calc_c;
+ s->imdct_calc = ff_imdct_calc;
s->exptab1 = NULL;
/* compute constant table for HAVE_SSE version */
@@ -62,11 +63,7 @@
int has_vectors = 0;
#if defined(HAVE_MMX)
-#ifdef HAVE_MM3DNOW
has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
-#else
- has_vectors = mm_support() & (MM_SSE | MM_SSE2);
-#endif
#endif
#if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
has_vectors = mm_support() & MM_ALTIVEC;
@@ -98,6 +95,8 @@
} while (nblocks != 0);
av_freep(&s->exptab);
#if defined(HAVE_MMX)
+ if (has_vectors & MM_3DNOWEXT)
+ s->imdct_calc = ff_imdct_calc_3dn2;
#ifdef HAVE_MM3DNOW
if (has_vectors & MM_3DNOWEXT)
/* 3DNowEx for Athlon(XP) */
Modified: trunk/libavcodec/i386/fft_3dn2.c
==============================================================================
--- trunk/libavcodec/i386/fft_3dn2.c (original)
+++ trunk/libavcodec/i386/fft_3dn2.c Tue Aug 8 06:01:04 2006
@@ -1,6 +1,6 @@
/*
* FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie.
+ * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
*
* This library is free software; you can redistribute it and/or
@@ -134,3 +134,84 @@
}
#endif
+
+/**
+ * Inverse MDCT implemented with Extended 3DNow! inline assembly
+ * (uses pswapd and pfpnacc).
+ *
+ * Stages: pre-rotate the input by (tcos[k], tsin[k]) and scatter the result
+ * into tmp through s->fft.revtab, run ff_fft_calc() on tmp, post-rotate tmp
+ * in place by the same table, then reorder with sign flips into output.
+ *
+ * @param s      MDCT context; nbits, tcos, tsin and fft.revtab are read
+ * @param output receives n = 1 << s->nbits samples (indices 0..n-1 written)
+ * @param input  coefficients; indices 0..n/2-1 are read
+ * @param tmp    scratch buffer, reinterpreted as an FFTComplex array;
+ *               entries 0..n/4-1 are accessed directly
+ *               (assumes revtab[k] stays inside that range -- TODO confirm)
+ */
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation:
+     * z[revtab[k]].re = in2[-2k]*tcos[k] - in1[2k]*tsin[k]
+     * z[revtab[k]].im = in2[-2k]*tsin[k] + in1[2k]*tcos[k]
+     * in1 walks forward over even inputs, in2 backward over odd inputs. */
+    in1 = input;
+    in2 = input + n2 - 1;
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movd %1, %%mm0 \n\t"
+            "movd %3, %%mm1 \n\t"
+            "punpckldq %2, %%mm0 \n\t" /* mm0 = { in2[-2k], in1[2k] } */
+            "punpckldq %4, %%mm1 \n\t" /* mm1 = { tcos[k], tsin[k] } */
+            "movq %%mm0, %%mm2 \n\t"
+            "pfmul %%mm1, %%mm0 \n\t"
+            "pswapd %%mm1, %%mm1 \n\t" /* mm1 = { tsin[k], tcos[k] } */
+            "pfmul %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t" /* low: mm0.lo - mm0.hi, high: mm2.lo + mm2.hi */
+            "movq %%mm0, %0 \n\t"
+            :"=m"(z[revtab[k]])
+            :"m"(in2[-2*k]), "m"(in1[2*k]),
+             "m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    ff_fft_calc(&s->fft, z);
+
+    /* post rotation + reordering */
+    /* in-place complex multiply: z[k] *= (tcos[k] + i*tsin[k]) */
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movq %0, %%mm0 \n\t"
+            "movd %1, %%mm1 \n\t"
+            "punpckldq %2, %%mm1 \n\t" /* mm1 = { tcos[k], tsin[k] } */
+            "movq %%mm0, %%mm2 \n\t"
+            "pfmul %%mm1, %%mm0 \n\t"
+            "pswapd %%mm1, %%mm1 \n\t"
+            "pfmul %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq %%mm0, %0 \n\t"
+            :"+m"(z[k])
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    /* mm7 = sign-bit mask for the low float only (movd clears the high half);
+     * pxor with it negates the low element of a register.
+     * 1U rather than 1: shifting a signed 1 into bit 31 is undefined.
+     * NOTE(review): mm7 must survive between the asm statements below; no
+     * MMX clobbers are declared, so this relies on the compiler not
+     * touching MMX registers here. */
+    asm volatile("movd %0, %%mm7" ::"r"(1U << 31));
+    for(k = 0; k < n8; k++) {
+        /* Each movq store writes two consecutive floats while the "=m"
+         * operands name only the first; the "memory" clobber covers the
+         * second. */
+        asm volatile(
+            "movq %4, %%mm0 \n\t"
+            "pswapd %5, %%mm1 \n\t"
+            "movq %%mm0, %%mm2 \n\t"
+            "pxor %%mm7, %%mm2 \n\t"
+            "punpckldq %%mm1, %%mm2 \n\t"
+            "pswapd %%mm2, %%mm3 \n\t"
+            "punpckhdq %%mm1, %%mm0 \n\t"
+            "pswapd %%mm0, %%mm4 \n\t"
+            "pxor %%mm7, %%mm0 \n\t"
+            "pxor %%mm7, %%mm4 \n\t"
+            "movq %%mm0, %0 \n\t" // { -z[n8+k].im, z[n8-1-k].re }
+            "movq %%mm4, %1 \n\t" // { -z[n8-1-k].re, z[n8+k].im }
+            "movq %%mm2, %2 \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
+             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
+            :"m"(z[n8+k]), "m"(z[n8-1-k])
+            :"memory"
+        );
+    }
+    /* leave MMX state so that x87 floating-point code can run afterwards */
+    asm volatile("emms");
+}
Modified: trunk/libavcodec/vorbis.c
==============================================================================
--- trunk/libavcodec/vorbis.c (original)
+++ trunk/libavcodec/vorbis.c Tue Aug 8 06:01:04 2006
@@ -1598,7 +1598,7 @@
saved_start=vc->saved_start;
- ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
+ vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
if (vc->modes[mode_number].blockflag) {
// -- overlap/add
Modified: trunk/libavcodec/wmadec.c
==============================================================================
--- trunk/libavcodec/wmadec.c (original)
+++ trunk/libavcodec/wmadec.c Tue Aug 8 06:01:04 2006
@@ -1113,7 +1113,7 @@
n = s->block_len;
n4 = s->block_len / 2;
- ff_imdct_calc(&s->mdct_ctx[bsize],
+ s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize],
output, s->coefs[ch], s->mdct_tmp);
/* XXX: optimize all that by build the window and
More information about the ffmpeg-cvslog
mailing list