[Ffmpeg-devel] [PATCH] another vorbis optimization

Loren Merritt lorenm
Thu Aug 10 07:21:16 CEST 2006


On Thu, 10 Aug 2006, Luca Barbato wrote:
> Loren Merritt wrote:
>
>>  static int vorbis_parse_audio_packet(vorbis_context *vc) {
>>      GetBitContext *gb=&vc->gb;
>> @@ -1490,6 +1507,7 @@
>>      uint_fast8_t res_num=0;
>>      int_fast16_t retlen=0;
>>      uint_fast16_t saved_start=0;
>> +    int bias= (vc->dsp.float_to_int16 == ff_float_to_int16_c) ? 385 : 0;
>
> what about moving bias somewhere else (like in the vorbis_context) and
> spare a branch?

I guess it's cleaner that way. The difference in speed is not measurable 
(perfectly predictable branches are cheap).

> [...]
>
>>                  saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
>>                  buf += vc->blocksize_1/2;
>> -                for(i=0;i<saved_start;++i) {
>> -                    saved[i]=buf[i];
>> +                if(vc->dsp.float_to_int16 != ff_float_to_int16_c) {
>
> as ugly as could be: what about having 2 bias values in the context:
> scalar [385, 0], vector [0,(15<<23)] and spare some other branches?

One (perfectly predictable) branch vs. 224 extra integer additions in the 
non-SIMD case (saved_start = (blocksize_1-blocksize_0)/4, e.g. 224 with 
1024/128 block sizes). Well, if you value simplicity of the code over speed 
in this less-common case (and I guess I do too, else I would've mmxed this 
loop), then I can remove it; the two variants are sketched after the quoted 
loops below.

>> +                    for(i=0; i<saved_start; i++)
>> +                        ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + (15<<23);
>> +                } else {
>> +                    for(i=0; i<saved_start; i++)
>> +                        saved[i] = buf[i];
>>                  }
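
Concretely, the two variants under discussion (copying into saved[] on the
short-window save path; names as in the patch, sketch only): adding exp_bias
to the raw IEEE-754 bits multiplies a normal, nonzero float by 2^15, and with
exp_bias == 0 the add is a bit-exact copy, so the branch-free form also covers
the scalar path, at the cost of those extra integer adds.

    /* branchy version (quoted above): */
    if (vc->exp_bias) {
        for (i = 0; i < saved_start; i++)
            ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + (15<<23);
    } else {
        for (i = 0; i < saved_start; i++)
            saved[i] = buf[i];
    }

    /* branch-free version (what the attached patch now does): */
    for (i = 0; i < saved_start; i++)
        ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;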
>
> [....]
>
>> +        int_fast32_t tmp = ((int32_t*)src)[i];
>> +        if(tmp & 0xf0000){
>> +//          tmp = (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
>> +            if(tmp > 0x43c0ffff) tmp = 0xFFFF;
>> +            else                 tmp = 0;
>> +        }
>
> Is it really slower now?

No, the commented-out version is faster here. But the question that 
matters is: which is faster on CPUs that have neither SSE nor 3DNow? And I 
can't benchmark that.
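
For completeness, here is the trick both clamp variants rely on, as a
standalone sketch (illustrative only -- the helper name is mine, and the
patch version below works on a whole array): a sample biased into
[384.0,386.0) has the bit pattern 0x43C08000 + x, so the int16 value falls
out of the low bits and only out-of-range samples need the clamp.

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the biased float->int16 conversion used by
     * ff_float_to_int16_c.  A float f = 385.0 + x/32768 with x in
     * [-32768,32767] has the IEEE-754 bit pattern 0x43C08000 + x. */
    static int16_t biased_float_to_int16(float f)
    {
        int32_t tmp;
        memcpy(&tmp, &f, sizeof(tmp));  /* reinterpret the bits (the patch casts the pointer) */
        if (tmp & 0xf0000) {            /* not 0x43C0xxxx -> out of range, clamp */
            /* branchless alternative: tmp = (0x43c0ffff - tmp) >> 31; */
            if (tmp > 0x43c0ffff) tmp = 0xFFFF;  /* becomes +32767 below */
            else                  tmp = 0;       /* becomes -32768 below */
        }
        return tmp - 0x8000;            /* truncation to int16 keeps the low 16 bits */
    }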

--Loren Merritt
-------------- next part --------------
Index: vorbis.c
===================================================================
--- vorbis.c	(revision 5954)
+++ vorbis.c	(working copy)
@@ -192,6 +192,11 @@
         av_free(vc->mappings[i].mux);
     }
     av_freep(&vc->mappings);
+
+    if(vc->exp_bias){
+        av_freep(&vc->swin);
+        av_freep(&vc->lwin);
+    }
 }
 
 // Parse setup header -------------------------------------------------
@@ -888,6 +893,19 @@
     vc->swin=vwin[bl0-6];
     vc->lwin=vwin[bl1-6];
 
+    if(vc->exp_bias){
+        int i;
+        float *win;
+        win = av_malloc(vc->blocksize_0/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_0/2; i++)
+            win[i] = vc->swin[i] * (1<<15);
+        vc->swin = win;
+        win = av_malloc(vc->blocksize_1/2 * sizeof(float));
+        for(i=0; i<vc->blocksize_1/2; i++)
+            win[i] = vc->lwin[i] * (1<<15);
+        vc->lwin = win;
+    }
+
     if ((get_bits1(gb)) == 0) {
         av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
         return 2;
@@ -931,6 +949,14 @@
     vc->avccontext = avccontext;
     dsputil_init(&vc->dsp, avccontext);
 
+    if(vc->dsp.float_to_int16 == ff_float_to_int16_c) {
+        vc->add_bias = 385;
+        vc->exp_bias = 0;
+    } else {
+        vc->add_bias = 0;
+        vc->exp_bias = 15<<23;
+    }
+
     if (!headers_len) {
         av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n");
         return -1;
@@ -1472,7 +1498,6 @@
 }
 
 // Decode the audio packet using the functions above
-#define BIAS 385
 
 static int vorbis_parse_audio_packet(vorbis_context *vc) {
     GetBitContext *gb=&vc->gb;
@@ -1490,6 +1515,7 @@
     uint_fast8_t res_num=0;
     int_fast16_t retlen=0;
     uint_fast16_t saved_start=0;
+    float fadd_bias = vc->add_bias;
 
     if (get_bits1(gb)) {
         av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n");
@@ -1576,10 +1602,7 @@
 
     for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) {
         ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2;
-
-        for(i=0;i<blocksize/2;++i) {
-            ch_floor_ptr[i]*=ch_res_ptr[i]; //FPMATH
-        }
+        vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize/2);
     }
 
 // MDCT, overlap/add, save data for next overlapping  FPMATH
@@ -1600,59 +1623,54 @@
 
         vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
 
+        //FIXME process channels together, to allow faster simd vector_fmul_add_add?
         if (vc->modes[mode_number].blockflag) {
             // -- overlap/add
             if (previous_window) {
-                for(k=j, i=0;i<vc->blocksize_1/2;++i, k+=step) {
-                    ret[k]=saved[i]+buf[i]*lwin[i]+BIAS;
-                }
+                vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize_1/2, step);
                 retlen=vc->blocksize_1/2;
             } else {
-                buf += (vc->blocksize_1-vc->blocksize_0)/4;
-                for(k=j, i=0;i<vc->blocksize_0/2;++i, k+=step) {
-                    ret[k]=saved[i]+buf[i]*swin[i]+BIAS;
-                }
+                int len = (vc->blocksize_1-vc->blocksize_0)/4;
+                buf += len;
+                vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize_0/2, step);
+                k = vc->blocksize_0/2*step + j;
                 buf += vc->blocksize_0/2;
-                for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) {
-                    ret[k]=buf[i]+BIAS;
+                if(vc->exp_bias){
+                    for(i=0; i<len; i++, k+=step)
+                        ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k] = buf[i]*(1<<15)
+                } else {
+                    for(i=0; i<len; i++, k+=step)
+                        ret[k] = buf[i] + fadd_bias;
                 }
                 buf=vc->buf;
-                retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4;
+                retlen=vc->blocksize_0/2+len;
             }
             // -- save
             if (next_window) {
                 buf += vc->blocksize_1/2;
-                lwin += vc->blocksize_1/2-1;
-                for(i=0;i<vc->blocksize_1/2;++i) {
-                    saved[i]=buf[i]*lwin[-i];
-                }
+                vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize_1/2);
                 saved_start=0;
             } else {
                 saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
                 buf += vc->blocksize_1/2;
-                for(i=0;i<saved_start;++i) {
-                    saved[i]=buf[i];
-                }
-                swin += vc->blocksize_0/2-1;
-                for(i=0;i<vc->blocksize_0/2;++i) {
-                    saved[saved_start+i]=buf[saved_start+i]*swin[-i];
-                }
+                for(i=0; i<saved_start; i++)
+                    ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;
+                vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize_0/2);
             }
         } else {
             // --overlap/add
-            for(k=j, i=0;i<saved_start;++i, k+=step) {
-                ret[k]=saved[i]+BIAS;
+            if(vc->add_bias) {
+                for(k=j, i=0;i<saved_start;++i, k+=step)
+                    ret[k] = saved[i] + fadd_bias;
+            } else {
+                for(k=j, i=0;i<saved_start;++i, k+=step)
+                    ret[k] = saved[i];
             }
-            for(i=0;i<vc->blocksize_0/2;++i, k+=step) {
-                ret[k]=saved[saved_start+i]+buf[i]*swin[i]+BIAS;
-            }
+            vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize_0/2, step);
             retlen=saved_start+vc->blocksize_0/2;
             // -- save
             buf += vc->blocksize_0/2;
-            swin += vc->blocksize_0/2-1;
-            for(i=0;i<vc->blocksize_0/2;++i) {
-                saved[i]=buf[i]*swin[-i];
-            }
+            vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize_0/2);
             saved_start=0;
         }
     }
@@ -1695,16 +1713,7 @@
 
     AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
 
-    for(i=0;i<len;++i) {
-        int_fast32_t tmp= ((int32_t*)vc->ret)[i];
-        if(tmp & 0xf0000){
-//            tmp= (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
-            if(tmp > 0x43c0ffff) tmp= 0xFFFF;
-            else                 tmp= 0;
-        }
-        ((int16_t*)data)[i]=tmp - 0x8000;
-    }
-
+    vc->dsp.float_to_int16(data, vc->ret, len);
     *data_size=len*2;
 
     return buf_size ;
Index: dsputil.c
===================================================================
--- dsputil.c	(revision 5954)
+++ dsputil.c	(working copy)
@@ -3753,6 +3753,38 @@
 WARPER8_16_SQ(rd8x8_c, rd16_c)
 WARPER8_16_SQ(bit8x8_c, bit16_c)
 
+static void vector_fmul_c(float *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i] *= src[i];
+}
+
+static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
+    int i;
+    src1 += len-1;
+    for(i=0; i<len; i++)
+        dst[i] = src0[i] * src1[-i];
+}
+
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
+    int i;
+    for(i=0; i<len; i++)
+        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
+}
+
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i++) {
+        int_fast32_t tmp = ((int32_t*)src)[i];
+        if(tmp & 0xf0000){
+//          tmp = (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
+            if(tmp > 0x43c0ffff) tmp = 0xFFFF;
+            else                 tmp = 0;
+        }
+        dst[i] = tmp - 0x8000;
+    }
+}
+
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
  converted */
 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
@@ -4096,6 +4128,10 @@
 #ifdef CONFIG_VORBIS_DECODER
     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
 #endif
+    c->vector_fmul = vector_fmul_c;
+    c->vector_fmul_reverse = vector_fmul_reverse_c;
+    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
+    c->float_to_int16 = ff_float_to_int16_c;
 
     c->shrink[0]= ff_img_copy_plane;
     c->shrink[1]= ff_shrink22;
Index: dsputil.h
===================================================================
--- dsputil.h	(revision 5954)
+++ dsputil.h	(working copy)
@@ -58,6 +58,10 @@
 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
 
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
+                              const float *src2, int src3, int blocksize, int step);
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len);
+
 /* encoding scans */
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
@@ -307,8 +311,20 @@
 
     void (*h261_loop_filter)(uint8_t *src, int stride);
 
+    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+    void (*vector_fmul)(float *dst, const float *src, int len);
+    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
+    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
+    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
+    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
 
+    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
+     * asm versions: convert floats from [-32768.0,32767.0] without rescaling */
+    void (*float_to_int16)(int16_t *dst, const float *src, int len);
+    int float_to_int16_expbias;
+    int float_to_int16_addbias;
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*fdct248)(DCTELEM *block/* align 16*/);
Index: i386/dsputil_mmx.c
===================================================================
--- i386/dsputil_mmx.c	(revision 5965)
+++ i386/dsputil_mmx.c	(working copy)
@@ -2772,6 +2772,199 @@
     }
 }
 
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+    long i;
+    len >>= 1;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movq  %0, %%mm0 \n\t"
+            "pfmul %1, %%mm0 \n\t"
+            "movq  %%mm0, %0 \n\t"
+            :"+m"(dst[i*2])
+            :"m"(src[i*2])
+            :"memory"
+        );
+    }
+    //FIXME move emms into caller?
+    asm volatile("femms");
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+    long i;
+    len >>= 2;
+    for(i=0; i<len; i++) {
+        asm volatile(
+            "movaps %0, %%xmm0 \n\t"
+            "mulps  %1, %%xmm0 \n\t"
+            "movaps %%xmm0, %0 \n\t"
+            :"+m"(dst[i*4])
+            :"m"(src[i*4])
+            :"memory"
+        );
+    }
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-16;
+    asm volatile(
+        "1: \n\t"
+        "pswapd   8(%1), %%mm0 \n\t"
+        "pswapd    (%1), %%mm1 \n\t"
+        "pfmul  (%3,%0), %%mm0 \n\t"
+        "pfmul 8(%3,%0), %%mm1 \n\t"
+        "movq  %%mm0,  (%2,%0) \n\t"
+        "movq  %%mm1, 8(%2,%0) \n\t"
+        "add   $16, %1 \n\t"
+        "sub   $16, %0 \n\t"
+        "jge   1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+    asm volatile("femms");
+}
+static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-32;
+    asm volatile(
+        "1: \n\t"
+        "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
+        "pshufd $0x1b,   (%1), %%xmm1 \n\t"
+        "mulps        (%3,%0), %%xmm0 \n\t"
+        "mulps      16(%3,%0), %%xmm1 \n\t"
+        "movaps     %%xmm0,   (%2,%0) \n\t"
+        "movaps     %%xmm1, 16(%2,%0) \n\t"
+        "add    $32, %1 \n\t"
+        "sub    $32, %0 \n\t"
+        "jge    1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+                                      const float *src2, int src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-4)*4;
+        dst += (len-4)*2;
+        asm volatile(
+            "1: \n\t"
+            "movq   (%2,%0),  %%mm0 \n\t"
+            "movq  8(%2,%0),  %%mm1 \n\t"
+            "pfmul  (%3,%0),  %%mm0 \n\t"
+            "pfmul 8(%3,%0),  %%mm1 \n\t"
+            "pfadd  (%4,%0),  %%mm0 \n\t"
+            "pfadd 8(%4,%0),  %%mm1 \n\t"
+            "movd     %%mm0,   (%1) \n\t"
+            "movd     %%mm1, 16(%1) \n\t"
+            "psrlq      $32,  %%mm0 \n\t"
+            "psrlq      $32,  %%mm1 \n\t"
+            "movd     %%mm0,  8(%1) \n\t"
+            "movd     %%mm1, 24(%1) \n\t"
+            "sub  $32, %1 \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=2){
+            asm volatile(
+                "movq      %1, %%mm0 \n\t"
+                "pfmul     %2, %%mm0 \n\t"
+                "pfadd     %3, %%mm0 \n\t"
+                "movq   %%mm0, %0    \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+    asm volatile("femms");
+}
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+                                    const float *src2, int src3, int len, int step){
+    long i;
+    if(step == 2 && src3 == 0){
+        i = (len-8)*4;
+        dst += (len-8)*2;
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movd      %%xmm0,   (%1) \n\t"
+            "movd      %%xmm1, 32(%1) \n\t"
+            "psrldq        $4, %%xmm0 \n\t"
+            "psrldq        $4, %%xmm1 \n\t"
+            "movd      %%xmm0,  8(%1) \n\t"
+            "movd      %%xmm1, 40(%1) \n\t"
+            "psrldq        $4, %%xmm0 \n\t"
+            "psrldq        $4, %%xmm1 \n\t"
+            "movd      %%xmm0, 16(%1) \n\t"
+            "movd      %%xmm1, 48(%1) \n\t"
+            "psrldq        $4, %%xmm0 \n\t"
+            "psrldq        $4, %%xmm1 \n\t"
+            "movd      %%xmm0, 24(%1) \n\t"
+            "movd      %%xmm1, 56(%1) \n\t"
+            "sub  $64, %1 \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        for(i=0; i<len; i+=4){
+            asm volatile(
+                "movaps     %1, %%xmm0 \n\t"
+                "mulps      %2, %%xmm0 \n\t"
+                "addps      %3, %%xmm0 \n\t"
+                "movaps %%xmm0, %0     \n\t"
+                :"=m"(dst[i])
+                :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+            );
+        }
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "pf2id       %1, %%mm0 \n\t"
+            "pf2id       %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0, %0    \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("femms");
+}
+void float_to_int16_sse(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "cvtps2pi    %1, %%mm0 \n\t"
+            "cvtps2pi    %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0, %0    \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("emms");
+}
+
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3199,10 +3392,24 @@
         }
 #endif
 
-        if(mm_flags & MM_SSE)
+        if(mm_flags & MM_3DNOW){
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+            c->vector_fmul = vector_fmul_3dnow;
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+                c->float_to_int16 = float_to_int16_3dnow;
+        }
+        if(mm_flags & MM_3DNOWEXT)
+            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+        if(mm_flags & MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-        else if(mm_flags & MM_3DNOW)
-            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+            c->vector_fmul = vector_fmul_sse;
+            c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->float_to_int16 = float_to_int16_sse;
+        }
+        if(mm_flags & MM_SSE2)
+            c->vector_fmul_reverse = vector_fmul_reverse_sse2;
+        if(mm_flags & MM_3DNOW)
+            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
     }
 
 #ifdef CONFIG_ENCODERS
