[FFmpeg-devel] [RFC/PATCH] More flexible variant of float_to_int16, WMA optimization, Vorbis

Loren Merritt lorenm
Thu Jul 17 10:00:26 CEST 2008


On Wed, 16 Jul 2008, Siarhei Siamashka wrote:

> Well, merging the loops that are run after iFFT and combining them with
> windowing code can probably provide interesting results. At least it should
> eliminate a lot of intermediate load and store operations. Maybe having iFFT
> output processed in a single loop could allow reading old saved data and
> also replace it with new saved data at the same time? At least in some
> simple cases when previous and current blocks have the same size.

Kinda ugly, having to merge those functions instead of composing them out 
of small readable dsps. But it works.
total vorbis speedup
k8: 1.3%
conroe: 5%
penryn: 8%
prescott: no change

--Loren Merritt
-------------- next part --------------
Index: i386/fft_3dn2.c
===================================================================
--- i386/fft_3dn2.c	(revision 14255)
+++ i386/fft_3dn2.c	(working copy)
@@ -124,7 +124,7 @@
     asm volatile("femms");
 }
 
-static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
     long n4, n2, n;
     x86_reg k;
@@ -132,7 +132,7 @@
     const FFTSample *tcos = s->tcos;
     const FFTSample *tsin = s->tsin;
     const FFTSample *in1, *in2;
-    FFTComplex *z = (FFTComplex *)tmp;
+    FFTComplex *z = (FFTComplex *)output;
 
     n = 1 << s->nbits;
     n2 = n >> 1;
@@ -164,7 +164,7 @@
 
     ff_fft_calc(&s->fft, z);
 
-    /* post rotation + reordering */
+    /* post rotation */
     for(k = 0; k < n4; k++) {
         asm volatile(
             "movq       %0, %%mm0 \n\t"
@@ -180,6 +180,7 @@
             :"m"(tcos[k]), "m"(tsin[k])
         );
     }
+    asm volatile("femms");
 }
 
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
@@ -193,7 +194,7 @@
     n2 = n >> 1;
     n8 = n >> 3;
 
-    imdct_3dn2(s, input, tmp);
+    ff_imdct_half_3dn2(s, tmp, input);
 
     k = n-8;
     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
@@ -224,40 +225,3 @@
     asm volatile("femms");
 }
 
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
-                        const FFTSample *input, FFTSample *tmp)
-{
-    x86_reg j, k;
-    long n8, n4, n;
-    FFTComplex *z = (FFTComplex *)tmp;
-
-    n = 1 << s->nbits;
-    n4 = n >> 2;
-    n8 = n >> 3;
-
-    imdct_3dn2(s, input, tmp);
-
-    j = -n;
-    k = n-8;
-    asm volatile("movd %0, %%mm7" ::"r"(1<<31));
-    asm volatile(
-        "1: \n\t"
-        "movq    (%3,%1), %%mm0 \n\t" // z[n8+k]
-        "pswapd  (%3,%0), %%mm1 \n\t" // z[n8-1-k]
-        "movq      %%mm0, %%mm2 \n\t"
-        "punpckldq %%mm1, %%mm0 \n\t"
-        "punpckhdq %%mm2, %%mm1 \n\t"
-        "pxor      %%mm7, %%mm0 \n\t"
-        "pxor      %%mm7, %%mm1 \n\t"
-        "movq      %%mm0, (%2,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
-        "movq      %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
-        "sub $8, %1 \n\t"
-        "add $8, %0 \n\t"
-        "jl 1b \n\t"
-        :"+r"(j), "+r"(k)
-        :"r"(output+n4), "r"(z+n8)
-        :"memory"
-    );
-    asm volatile("femms");
-}
-
Index: i386/dsputil_mmx.c
===================================================================
--- i386/dsputil_mmx.c	(revision 14255)
+++ i386/dsputil_mmx.c	(working copy)
@@ -67,6 +67,8 @@
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
 
+DECLARE_ALIGNED_16(const int, ff_ps_sign[4]) = { 1<<31, 1<<31, 1<<31, 1<<31 };
+
 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
 
@@ -2022,63 +2024,136 @@
         ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
-static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
-                                      const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
+static void vector_exp_bias(float *dst, x86_reg i, int exp_bias){
+    asm volatile(
+        "movd         %2, %%mm2 \n"
+        "punpckldq %%mm2, %%mm2 \n"
+        "1: \n"
+        "movq    (%1,%0), %%mm0 \n"
+        "movq   8(%1,%0), %%mm1 \n"
+        "paddd  %%mm2,    %%mm0 \n"
+        "paddd  %%mm2,    %%mm1 \n"
+        "movq   %%mm0,  (%1,%0) \n"
+        "movq   %%mm1, 8(%1,%0) \n"
+        "add $16, %0 \n"
+        "jl 1b \n"
+        :"+r"(i)
+        :"r"(dst), "g"(exp_bias)
+        :"memory"
+    );
+}
+
+static void vector_fmul_window_3dnow2(float *dst, float *saved, const float *fft, const float *win,
+                                      int exp_bias, float add_bias, int len, int fft_len){
+#ifdef HAVE_7REGS
     if(add_bias == 0){
+        const float *fft2 = fft + (fft_len>>1);
         x86_reg i = -len*4;
         x86_reg j = len*4-8;
+        dst += len;
+        win += len;
+
+        asm volatile("movd %0, %%mm7" ::"r"(1<<31));
         asm volatile(
+            "pswapd    %%mm7, %%mm6 \n"
             "1: \n"
-            "pswapd  (%5,%1), %%mm1 \n"
-            "movq    (%5,%0), %%mm0 \n"
-            "pswapd  (%4,%1), %%mm5 \n"
-            "movq    (%3,%0), %%mm4 \n"
+            "pswapd  (%5,%0), %%mm0 \n"
+            "movq    (%4,%1), %%mm1 \n"
             "movq      %%mm0, %%mm2 \n"
+            "punpckldq %%mm1, %%mm0 \n"
+            "punpckhdq %%mm1, %%mm2 \n"
+            "pxor      %%mm6, %%mm0 \n" // r0..r1
+            "pxor      %%mm7, %%mm2 \n"
+            "movq    (%3,%0), %%mm1 \n" // l0..l1
+            "movq      %%mm2, (%3,%0) \n"
+            "pswapd  (%6,%1), %%mm5 \n" // wj
+            "movq    (%6,%0), %%mm4 \n" // wi
             "movq      %%mm1, %%mm3 \n"
-            "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
-            "pfmul     %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
-            "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
-            "pfmul     %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
-            "pfadd     %%mm3, %%mm2 \n"
-            "pfsub     %%mm0, %%mm1 \n"
+            "movq      %%mm0, %%mm2 \n"
+            "pfmul     %%mm4, %%mm3 \n" // l*wi
+            "pfmul     %%mm5, %%mm2 \n" // r*wj
+            "pfmul     %%mm5, %%mm1 \n" // l*wj
+            "pfmul     %%mm4, %%mm0 \n" // r*wi
+            "pfadd     %%mm3, %%mm2 \n" // l*wi+r*wj
+            "pfsub     %%mm0, %%mm1 \n" // l*wj-r*wi
             "pswapd    %%mm2, %%mm2 \n"
             "movq      %%mm1, (%2,%0) \n"
             "movq      %%mm2, (%2,%1) \n"
             "sub $8, %1 \n"
             "add $8, %0 \n"
             "jl 1b \n"
-            "femms \n"
             :"+r"(i), "+r"(j)
-            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+            :"r"(dst), "r"(saved), "r"(fft), "r"(fft2), "r"(win)
+            :"memory"
         );
+
+        if(fft_len > len*4){
+            i = len*4-fft_len;
+            j = fft_len-len*4-8;
+            asm volatile("movd %0, %%mm7" ::"r"(1<<31));
+            asm volatile(
+                "1: \n"
+                "movq    (%4,%1), %%mm0 \n"
+                "pswapd  (%5,%0), %%mm2 \n"
+                "movq      %%mm0, %%mm1 \n"
+                "punpckldq %%mm2, %%mm0 \n"
+                "punpckhdq %%mm1, %%mm2 \n"
+                "pxor      %%mm7, %%mm0 \n"
+                "pxor      %%mm7, %%mm2 \n"
+                "movq      %%mm0, (%2,%1) \n"
+                "movq      %%mm2, (%3,%0) \n"
+                "sub $8, %1 \n"
+                "add $8, %0 \n"
+                "jl 1b \n"
+                :"+r"(i), "+r"(j)
+                :"r"(dst+len), "r"(saved-len), "r"(fft+len), "r"(fft2-len)
+                :"memory"
+            );
+
+            if(exp_bias)
+                vector_exp_bias(dst+fft_len/4, len*4-fft_len, exp_bias);
+        }
+        asm volatile("femms");
     }else
 #endif
-        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+        ff_vector_fmul_window_c(dst, saved, fft, win, exp_bias, add_bias, len, fft_len);
 }
 
-static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
-                                   const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
+static void vector_fmul_window_sse(float *dst, float *saved, const float *fft, const float *win,
+                                   int exp_bias, float add_bias, int len, int fft_len){
+#ifdef HAVE_7REGS
     if(add_bias == 0){
+        const float *fft2 = fft + (fft_len>>1);
         x86_reg i = -len*4;
         x86_reg j = len*4-16;
+        dst += len;
+        win += len;
+
+        asm volatile("movaps %0, %%xmm7"::"m"(*ff_ps_sign));
         asm volatile(
             "1: \n"
-            "movaps       (%5,%1), %%xmm1 \n"
-            "movaps       (%5,%0), %%xmm0 \n"
-            "movaps       (%4,%1), %%xmm5 \n"
-            "movaps       (%3,%0), %%xmm4 \n"
-            "shufps $0x1b, %%xmm1, %%xmm1 \n"
+            "movaps       (%4,%1), %%xmm2 \n"
+            "movaps       (%5,%0), %%xmm1 \n"
+            "xorps         %%xmm7, %%xmm2 \n"
+            "movaps        %%xmm2, %%xmm0 \n"
+            "shufps $0x8d, %%xmm1, %%xmm2 \n"
+            "shufps $0xd8, %%xmm1, %%xmm0 \n"
+            "shufps $0x36, %%xmm2, %%xmm2 \n"
+            "shufps $0x36, %%xmm0, %%xmm0 \n" // r0..r3
+            "xorps         %%xmm7, %%xmm2 \n"
+            "movaps       (%3,%0), %%xmm1 \n" // l0..l3
+            "movaps        %%xmm2, (%3,%0) \n"
+            "movaps       (%6,%1), %%xmm5 \n" // wj
+            "movaps       (%6,%0), %%xmm4 \n" // wi
             "shufps $0x1b, %%xmm5, %%xmm5 \n"
+            "movaps        %%xmm1, %%xmm3 \n"
             "movaps        %%xmm0, %%xmm2 \n"
-            "movaps        %%xmm1, %%xmm3 \n"
-            "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
-            "mulps         %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
-            "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
-            "mulps         %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
-            "addps         %%xmm3, %%xmm2 \n"
-            "subps         %%xmm0, %%xmm1 \n"
+            "mulps         %%xmm4, %%xmm3 \n" // l*wi
+            "mulps         %%xmm5, %%xmm2 \n" // r*wj
+            "mulps         %%xmm5, %%xmm1 \n" // l*wj
+            "mulps         %%xmm4, %%xmm0 \n" // r*wi
+            "addps         %%xmm3, %%xmm2 \n" // l*wi+r*wj
+            "subps         %%xmm0, %%xmm1 \n" // l*wj-r*wi
             "shufps $0x1b, %%xmm2, %%xmm2 \n"
             "movaps        %%xmm1, (%2,%0) \n"
             "movaps        %%xmm2, (%2,%1) \n"
@@ -2086,11 +2161,42 @@
             "add $16, %0 \n"
             "jl 1b \n"
             :"+r"(i), "+r"(j)
-            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+            :"r"(dst), "r"(saved), "r"(fft), "r"(fft2), "r"(win)
+            :"memory"
         );
+
+        if(fft_len > len*4){
+            i = len*4-fft_len;
+            j = fft_len-len*4-16;
+            asm volatile(
+                "1: \n"
+                "movaps       (%4,%1), %%xmm2 \n"
+                "movaps       (%5,%0), %%xmm1 \n"
+                "xorps         %%xmm7, %%xmm2 \n"
+                "movaps        %%xmm2, %%xmm0 \n"
+                "shufps $0x8d, %%xmm1, %%xmm2 \n"
+                "shufps $0xd8, %%xmm1, %%xmm0 \n"
+                "shufps $0x36, %%xmm2, %%xmm2 \n"
+                "shufps $0x9c, %%xmm0, %%xmm0 \n"
+                "xorps         %%xmm7, %%xmm2 \n"
+                "movaps        %%xmm0, (%2,%1) \n"
+                "movaps        %%xmm2, (%3,%0) \n"
+                "sub $16, %1 \n"
+                "add $16, %0 \n"
+                "jl 1b \n"
+                :"+r"(i), "+r"(j)
+                :"r"(dst+len), "r"(saved-len), "r"(fft+len), "r"(fft2-len)
+                :"memory"
+            );
+
+            if(exp_bias){
+                vector_exp_bias(dst+fft_len/4, len*4-fft_len, exp_bias);
+                asm volatile("emms");
+            }
+        }
     }else
 #endif
-        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+        ff_vector_fmul_window_c(dst, saved, fft, win, exp_bias, add_bias, len, fft_len);
 }
 
 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
@@ -2678,7 +2784,6 @@
         }
         if(mm_flags & MM_3DNOWEXT){
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
-            c->vector_fmul_window = vector_fmul_window_3dnow2;
         }
         if(mm_flags & MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
@@ -2695,6 +2800,8 @@
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+        if(mm_flags & MM_3DNOWEXT)
+            c->vector_fmul_window = vector_fmul_window_3dnow2;
         if(mm_flags & MM_SSE2){
             c->add_int16 = add_int16_sse2;
             c->sub_int16 = sub_int16_sse2;
Index: i386/fft_sse.c
===================================================================
--- i386/fft_sse.c	(revision 14255)
+++ i386/fft_sse.c	(working copy)
@@ -142,7 +142,7 @@
     } while (nblocks != 0);
 }
 
-static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
     x86_reg k;
     long n4, n2, n;
@@ -150,7 +150,7 @@
     const FFTSample *tcos = s->tcos;
     const FFTSample *tsin = s->tsin;
     const FFTSample *in1, *in2;
-    FFTComplex *z = (FFTComplex *)tmp;
+    FFTComplex *z = (FFTComplex *)output;
 
     n = 1 << s->nbits;
     n2 = n >> 1;
@@ -271,7 +271,7 @@
     n2 = n >> 1;
     n8 = n >> 3;
 
-    imdct_sse(s, input, tmp);
+    ff_imdct_half_sse(s, tmp, input);
 
     /*
        Mnemonics:
@@ -313,41 +313,3 @@
     );
 }
 
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
-                       const FFTSample *input, FFTSample *tmp)
-{
-    x86_reg j, k;
-    long n8, n4, n;
-    FFTComplex *z = (FFTComplex *)tmp;
-
-    n = 1 << s->nbits;
-    n4 = n >> 2;
-    n8 = n >> 3;
-
-    imdct_sse(s, input, tmp);
-
-    j = -n;
-    k = n-16;
-    asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
-    asm volatile(
-        "1: \n\t"
-        "movaps     (%3,%1), %%xmm0 \n\t"
-        "movaps     (%3,%0), %%xmm1 \n\t"
-        "xorps       %%xmm7, %%xmm0 \n\t"
-        "movaps      %%xmm0, %%xmm2 \n\t"
-        "shufps $141,%%xmm1, %%xmm0 \n\t"
-        "shufps $216,%%xmm1, %%xmm2 \n\t"
-        "shufps $54, %%xmm0, %%xmm0 \n\t"
-        "shufps $156,%%xmm2, %%xmm2 \n\t"
-        "xorps       %%xmm7, %%xmm0 \n\t"
-        "movaps      %%xmm2, (%2,%1) \n\t"
-        "movaps      %%xmm0, (%2,%0) \n\t"
-        "sub $16, %1 \n\t"
-        "add $16, %0 \n\t"
-        "jl 1b \n\t"
-        :"+r"(j), "+r"(k)
-        :"r"(output+n4), "r"(z+n8)
-        :"memory"
-    );
-}
-
Index: mdct.c
===================================================================
--- mdct.c	(revision 14255)
+++ mdct.c	(working copy)
@@ -100,14 +100,21 @@
     (pim) = _are * _bim + _aim * _bre;\
 }
 
-static void imdct_c(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
+/**
+ * Compute the middle half of the inverse MDCT of size N = 2^nbits.
+ * Post-reordering and symmetry expansion are omitted, to be completed in
+ * either ff_imdct_calc or dsp.vector_fmul_window.
+ * @param output N/2 samples
+ * @param input N/2 samples
+ */
+void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
 {
     int k, n4, n2, n, j;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
     const FFTSample *tsin = s->tsin;
     const FFTSample *in1, *in2;
-    FFTComplex *z = (FFTComplex *)tmp;
+    FFTComplex *z = (FFTComplex *)output;
 
     n = 1 << s->nbits;
     n2 = n >> 1;
@@ -124,8 +131,7 @@
     }
     ff_fft_calc(&s->fft, z);
 
-    /* post rotation + reordering */
-    /* XXX: optimize */
+    /* post rotation */
     for(k = 0; k < n4; k++) {
         CMUL(z[k].re, z[k].im, z[k].re, z[k].im, tcos[k], tsin[k]);
     }
@@ -146,7 +152,7 @@
     n2 = n >> 1;
     n8 = n >> 3;
 
-    imdct_c(s, input, tmp);
+    ff_imdct_half(s, tmp, input);
 
     for(k = 0; k < n8; k++) {
         output[2*k] = -z[n8 + k].im;
@@ -164,32 +170,6 @@
 }
 
 /**
- * Compute the middle half of the inverse MDCT of size N = 2^nbits,
- * thus excluding the parts that can be derived by symmetry
- * @param output N/2 samples
- * @param input N/2 samples
- * @param tmp N/2 samples
- */
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
-                   const FFTSample *input, FFTSample *tmp)
-{
-    int k, n8, n4, n;
-    FFTComplex *z = (FFTComplex *)tmp;
-    n = 1 << s->nbits;
-    n4 = n >> 2;
-    n8 = n >> 3;
-
-    imdct_c(s, input, tmp);
-
-    for(k = 0; k < n8; k++) {
-        output[n4-1-2*k]   =  z[n8+k].im;
-        output[n4-1-2*k-1] = -z[n8-k-1].re;
-        output[n4 + 2*k]   = -z[n8+k].re;
-        output[n4 + 2*k+1] =  z[n8-k-1].im;
-    }
-}
-
-/**
  * Compute MDCT of size N = 2^nbits
  * @param input N samples
  * @param out N/2 samples
Index: dsputil.c
===================================================================
--- dsputil.c	(revision 14255)
+++ dsputil.c	(working copy)
@@ -3930,19 +3930,40 @@
         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
 }
 
-void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
+void ff_vector_fmul_window_c(float *dst, float *saved, const float *fft, const float *win,
+                             int exp_bias, float add_bias, int len, int fft_len){
+    int n2 = fft_len >> 1;
+    int n4 = fft_len >> 2;
     int i,j;
+
     dst += len;
     win += len;
-    src0+= len;
-    for(i=-len, j=len-1; i<0; i++, j--) {
-        float s0 = src0[i];
-        float s1 = src1[j];
-        float wi = win[i];
-        float wj = win[j];
-        dst[i] = s0*wj - s1*wi + add_bias;
-        dst[j] = s0*wi + s1*wj + add_bias;
+    for(i=1-len, j=len-1; i<0; i+=2, j-=2) {
+        float wi, wj;
+        float l0  = saved[i-1];
+        float l1  = saved[i];
+        float r0  =  fft[n2+i];
+        float r1  = -fft[j-1];
+        saved[i-1]= -fft[n2+i-1];
+        saved[i]  =  fft[j];
+        wi = win[i-1];
+        wj = win[j];
+        dst[i-1] = l0*wj - r0*wi + add_bias;
+        dst[j  ] = l0*wi + r0*wj + add_bias;
+        wi = win[i];
+        wj = win[j-1];
+        dst[i  ] = l1*wj - r1*wi + add_bias;
+        dst[j-1] = l1*wi + r1*wj + add_bias;
     }
+    for(i=1-n4, j=n4-1; i<-len; i+=2, j-=2) {
+        dst[j]    =  fft[n2+i] + add_bias;
+        dst[j-1]  = -fft[j-1]  + add_bias;
+        saved[i-1]= -fft[n2+i-1];
+        saved[i]  =  fft[j];
+    }
+    if(exp_bias)
+        for(j=len; j<n4; j++)
+            ((uint32_t*)dst)[j] += exp_bias;
 }
 
 static av_always_inline int float_to_int16_one(const float *src){
Index: dsputil.h
===================================================================
--- dsputil.h	(revision 14255)
+++ dsputil.h	(working copy)
@@ -63,8 +63,8 @@
 
 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
                               const float *src2, int src3, int blocksize, int step);
-void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
-                             const float *win, float add_bias, int len);
+void ff_vector_fmul_window_c(float *dst, float *saved, const float *fft, const float *window,
+                             int exp_bias, float add_bias, int win_len, int fft_len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
 
 /* encoding scans */
@@ -366,8 +366,11 @@
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
     /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
     void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
-    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
-    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
+    /* reorder mdct coefs (from fft order), expand mdct symmetries,
+     * average them with saved[] weighted by window[], store to dst[] and update saved[].
+     * assume len are multiples of 4, fft_len>=win_len, and arrays are 16-byte aligned */
+    void (*vector_fmul_window)(float *dst, float *saved, const float *fft, const float *window,
+                               int exp_bias, float add_bias, int win_len, int fft_len);
 
     /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
      * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
@@ -642,7 +645,7 @@
     void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp);
     void (*imdct_half)(struct MDCTContext *s, FFTSample *output,
-                       const FFTSample *input, FFTSample *tmp);
+                       const FFTSample *input);
 } FFTContext;
 
 int ff_fft_init(FFTContext *s, int nbits, int inverse);
@@ -688,16 +691,13 @@
 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
 void ff_imdct_calc(MDCTContext *s, FFTSample *output,
                 const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
-                   const FFTSample *input, FFTSample *tmp);
+void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                         const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
-                        const FFTSample *input, FFTSample *tmp);
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
-                       const FFTSample *input, FFTSample *tmp);
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                const FFTSample *input, FFTSample *tmp);
 void ff_mdct_end(MDCTContext *s);
Index: vorbis_dec.c
===================================================================
--- vorbis_dec.c	(revision 14255)
+++ vorbis_dec.c	(working copy)
@@ -1516,29 +1516,24 @@
 // MDCT, overlap/add, save data for next overlapping  FPMATH
 
     retlen = (blocksize + vc->blocksize[previous_window])/4;
-    dir = retlen <= blocksize/2; // pick an order so that ret[] can reuse residues[] without stepping on any data we need
+    dir = retlen <= blocksize/2; // pick an order so that ret[] can reuse floors[] without stepping on any data we need
     for(j=dir?0:vc->audio_channels-1; (unsigned)j<vc->audio_channels; j+=dir*2-1) {
         uint_fast16_t bs0=vc->blocksize[0];
         uint_fast16_t bs1=vc->blocksize[1];
-        float *residue=vc->channel_residues+res_chan[j]*blocksize/2;
+        float *buf=vc->channel_residues+res_chan[j]*blocksize/2;
         float *floor=vc->channel_floors+j*blocksize/2;
-        float *saved=vc->saved+j*bs1/4;
-        float *ret=vc->channel_residues+j*retlen;
-        float *buf=floor;
+        float *saved=vc->saved+(j+1)*bs1/4;
+        float *ret=vc->channel_floors+j*retlen;
         const float *win=vc->win[blockflag&previous_window];
 
-        vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor, residue);
+        vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor);
 
-        if(blockflag == previous_window) {
-            vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4);
-        } else if(blockflag > previous_window) {
-            vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, bs0/4);
-            copy_normalize(ret+bs0/2, buf+bs0/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
+        if(blockflag >= previous_window) {
+            vc->dsp.vector_fmul_window(ret, saved, buf, win, vc->exp_bias, fadd_bias, vc->blocksize[previous_window]/4, blocksize);
         } else {
-            copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
-            vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/4);
+            copy_normalize(ret, saved-bs1/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
+            vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved, buf, win, vc->exp_bias, fadd_bias, bs0/4, blocksize);
         }
-        memcpy(saved, buf+blocksize/4, blocksize/4*sizeof(float));
     }
 
     vc->previous_window = blockflag;
@@ -1582,7 +1577,7 @@
     AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
 
     for(i=0; i<vc->audio_channels; i++)
-        channel_ptrs[i] = vc->channel_residues+i*len;
+        channel_ptrs[i] = vc->channel_floors+i*len;
     vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
     *data_size=len*2*vc->audio_channels;
 



More information about the ffmpeg-devel mailing list