[Ffmpeg-devel] [PATCH] another vorbis optimization

Tue Aug 8 21:50:48 CEST 2006

On Tue, 8 Aug 2006, Michael Niedermayer wrote:
> [...]
>> +        ::"r"(15<<23)
>> +    );
>> +    for(i=0;i<len;i+=4) {
>> +        asm volatile(
>> +            "movdqa       %1, %%xmm0 \n\t"
>> +            "paddd    %%xmm7, %%xmm0 \n\t"
>
> I think that can be avoided by simply multiplying the windows by 1<<15.
>
>> +            "cvtps2dq %%xmm0, %%xmm0 \n\t"
>
> If that is replaced by cvtps2pi and the code below is changed accordingly, then
> the code should run on SSE1 CPUs. If it's slower, a separate SSE1 variant
> could be added too. That's of course just an idea; I am happy with SSE2 code
> too, just my CPU here isn't :)

Yes, moving the multiplication into the window is slightly faster, and once
that's done cvtps2pi doesn't lose any speed.

--Loren Merritt
-------------- next part --------------
Index: vorbis.c
===================================================================

--- vorbis.c	(revision 5954)
+++ vorbis.c	(working copy)
@@ -192,6 +192,11 @@
         av_free(vc->mappings[i].mux);
     }
     av_freep(&vc->mappings);
+
+#ifdef HAVE_SSE
+    av_freep(&vc->swin);
+    av_freep(&vc->lwin);
+#endif
 }
 
 // Parse setup header -------------------------------------------------
@@ -888,6 +893,21 @@
     vc->swin=vwin[bl0-6];
     vc->lwin=vwin[bl1-6];
 
+#ifdef HAVE_SSE
+    {
+        int i;
+        float *win;
+        win = av_malloc(vc->blocksize_0/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_0/2; i++)
+            win[i] = vc->swin[i] * (1<<15);
+        vc->swin = win;
+        win = av_malloc(vc->blocksize_1/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_1/2; i++)
+            win[i] = vc->lwin[i] * (1<<15);
+        vc->lwin = win;
+    }
+#endif
+
     if ((get_bits1(gb)) == 0) {
         av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
         return 2;
@@ -1472,7 +1492,11 @@
 }
 
 // Decode the audio packet using the functions above
+#ifdef HAVE_SSE
+#define BIAS 0
+#else
 #define BIAS 385
+#endif
 
 static int vorbis_parse_audio_packet(vorbis_context *vc) {
     GetBitContext *gb=&vc->gb;
@@ -1614,7 +1638,11 @@
                 }
                 buf += vc->blocksize_0/2;
                 for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) {
+#ifdef HAVE_SSE
+                    ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + (15<<23); // ret[k]=buf[i]*(1<<15)
+#else
                     ret[k]=buf[i]+BIAS;
+#endif
                 }
                 buf=vc->buf;
                 retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4;
@@ -1631,7 +1659,11 @@
                 saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
                 buf += vc->blocksize_1/2;
                 for(i=0;i<saved_start;++i) {
+#ifdef HAVE_SSE
+                    ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + (15<<23);
+#else
                     saved[i]=buf[i];
+#endif
                 }
                 swin += vc->blocksize_0/2-1;
                 for(i=0;i<vc->blocksize_0/2;++i) {
@@ -1695,6 +1727,18 @@
 
     AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
 
+#ifdef HAVE_SSE
+    for(i=0;i<len;i+=4) {
+        asm volatile(
+            "cvtps2pi    %1, %%mm0 \n\t"
+            "cvtps2pi    %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0, %0    \n\t"
+            :"=m"(((int16_t*)data)[i])
+            :"m"(vc->ret[i]), "m"(vc->ret[i+2])
+        );
+    }
+#else
     for(i=0;i<len;++i) {
         int_fast32_t tmp= ((int32_t*)vc->ret)[i];
         if(tmp & 0xf0000){
@@ -1704,6 +1748,7 @@
         }
         ((int16_t*)data)[i]=tmp - 0x8000;
     }
+#endif
 
     *data_size=len*2;