[Ffmpeg-devel] [PATCH] another vorbis optimization
Loren Merritt
lorenm
Tue Aug 8 21:50:48 CEST 2006
On Tue, 8 Aug 2006, Michael Niedermayer wrote:
> [...]
>> + ::"r"(15<<23)
>> + );
>> + for(i=0;i<len;i+=4) {
>> + asm volatile(
>> + "movdqa %1, %%xmm0 \n\t"
>> + "paddd %%xmm7, %%xmm0 \n\t"
>
> I think that can be avoided by simply multiplying the windows by 1<<15.
>
>> + "cvtps2dq %%xmm0, %%xmm0 \n\t"
>
> If that is replaced by cvtps2pi and the code below is changed accordingly, then
> the code should run on SSE1 CPUs. If it's slower, a separate SSE1 variant
> could be added too. That's of course just an idea; I am happy with SSE2 code
> too, it's just that my CPU here isn't :)
Yes, moving the mult into the window is slightly faster, and once
that's done cvtps2pi doesn't lose any speed.
--Loren Merritt
-------------- next part --------------
Index: vorbis.c
===================================================================
--- vorbis.c (revision 5954)
+++ vorbis.c (working copy)
@@ -192,6 +192,11 @@
av_free(vc->mappings[i].mux);
}
av_freep(&vc->mappings);
+
+#ifdef HAVE_SSE
+ av_freep(&vc->swin);
+ av_freep(&vc->lwin);
+#endif
}
// Parse setup header -------------------------------------------------
@@ -888,6 +893,21 @@
vc->swin=vwin[bl0-6];
vc->lwin=vwin[bl1-6];
+#ifdef HAVE_SSE
+ {
+ int i;
+ float *win;
+ win = av_malloc(vc->blocksize_0/2 * sizeof(float));
+ for(i=0; i<vc->blocksize_0/2; i++)
+ win[i] = vc->swin[i] * (1<<15);
+ vc->swin = win;
+ win = av_malloc(vc->blocksize_1/2 * sizeof(float));
+ for(i=0; i<vc->blocksize_1/2; i++)
+ win[i] = vc->lwin[i] * (1<<15);
+ vc->lwin = win;
+ }
+#endif
+
if ((get_bits1(gb)) == 0) {
av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
return 2;
@@ -1472,7 +1492,11 @@
}
// Decode the audio packet using the functions above
+#ifdef HAVE_SSE
+#define BIAS 0
+#else
#define BIAS 385
+#endif
static int vorbis_parse_audio_packet(vorbis_context *vc) {
GetBitContext *gb=&vc->gb;
@@ -1614,7 +1638,11 @@
}
buf += vc->blocksize_0/2;
for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) {
+#ifdef HAVE_SSE
+ ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + (15<<23); // ret[k]=buf[i]*(1<<15)
+#else
ret[k]=buf[i]+BIAS;
+#endif
}
buf=vc->buf;
retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4;
@@ -1631,7 +1659,11 @@
saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
buf += vc->blocksize_1/2;
for(i=0;i<saved_start;++i) {
+#ifdef HAVE_SSE
+ ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + (15<<23);
+#else
saved[i]=buf[i];
+#endif
}
swin += vc->blocksize_0/2-1;
for(i=0;i<vc->blocksize_0/2;++i) {
@@ -1695,6 +1727,18 @@
AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
+#ifdef HAVE_SSE
+ for(i=0;i<len;i+=4) {
+ asm volatile(
+ "cvtps2pi %1, %%mm0 \n\t"
+ "cvtps2pi %2, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"=m"(((int16_t*)data)[i])
+ :"m"(vc->ret[i]), "m"(vc->ret[i+2])
+ );
+ }
+#else
for(i=0;i<len;++i) {
int_fast32_t tmp= ((int32_t*)vc->ret)[i];
if(tmp & 0xf0000){
@@ -1704,6 +1748,7 @@
}
((int16_t*)data)[i]=tmp - 0x8000;
}
+#endif
*data_size=len*2;
More information about the ffmpeg-devel
mailing list