[Ffmpeg-devel] [PATCH] another vorbis optimization
Loren Merritt
lorenm
Thu Aug 10 03:13:31 CEST 2006
Now with optimized windowing too, for a total of 15% speedup.
--Loren Merritt
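Background for reviewers: the C output path keeps the existing 385-bias
trick, while the SIMD paths convert directly and instead fold the required
2^15 scale into private copies of the window tables. A standalone
illustration of the bias trick (demo only, not part of the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* s in (-1.0,1.0); 385.0f+s lands in (384.0,386.0), where the low 16 bits
 * of the IEEE 754 single representation are the PCM sample + 0x8000 */
static int16_t bias_trick(float s){
    float biased = 385.0f + s;
    uint32_t bits;
    memcpy(&bits, &biased, sizeof bits); /* 0x43C0xxxx for in-range input */
    return (int16_t)((bits & 0xFFFF) - 0x8000);
}

int main(void){
    printf("%d %d %d\n", bias_trick(0.0f), bias_trick(0.5f), bias_trick(-0.25f));
    /* prints: 0 16384 -8192, i.e. round(s*32768) */
    return 0;
}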
-------------- next part --------------
Index: vorbis.c
===================================================================
--- vorbis.c (revision 5954)
+++ vorbis.c (working copy)
@@ -192,6 +192,11 @@
av_free(vc->mappings[i].mux);
}
av_freep(&vc->mappings);
+
+ if(vc->dsp.float_to_int16 != ff_float_to_int16_c){
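+ // swin/lwin are malloc'd pre-scaled copies only when a SIMD
+ // float_to_int16 is in use; otherwise they point at static tables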
+ av_freep(&vc->swin);
+ av_freep(&vc->lwin);
+ }
}
// Parse setup header -------------------------------------------------
@@ -888,6 +893,19 @@
vc->swin=vwin[bl0-6];
vc->lwin=vwin[bl1-6];
+ if(vc->dsp.float_to_int16 != ff_float_to_int16_c){
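+ // SIMD float_to_int16 does no rescaling, so fold the required
+ // multiply by 2^15 into private copies of the window tables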
+ int i;
+ float *win;
+ win = av_malloc(vc->blocksize_0/2 * sizeof(float));
+ for(i=0; i<vc->blocksize_0/2; i++)
+ win[i] = vc->swin[i] * (1<<15);
+ vc->swin = win;
+ win = av_malloc(vc->blocksize_1/2 * sizeof(float));
+ for(i=0; i<vc->blocksize_1/2; i++)
+ win[i] = vc->lwin[i] * (1<<15);
+ vc->lwin = win;
+ }
+
if ((get_bits1(gb)) == 0) {
av_log(vc->avccontext, AV_LOG_ERROR, " Vorbis id header packet corrupt (framing flag not set). \n");
return 2;
@@ -1472,7 +1490,6 @@
}
// Decode the audio packet using the functions above
-#define BIAS 385
static int vorbis_parse_audio_packet(vorbis_context *vc) {
GetBitContext *gb=&vc->gb;
@@ -1490,6 +1507,7 @@
uint_fast8_t res_num=0;
int_fast16_t retlen=0;
uint_fast16_t saved_start=0;
+ int bias= (vc->dsp.float_to_int16 == ff_float_to_int16_c) ? 385 : 0;
if (get_bits1(gb)) {
av_log(vc->avccontext, AV_LOG_ERROR, "Not a Vorbis I audio packet.\n");
@@ -1576,10 +1594,7 @@
for(j=0, ch_floor_ptr=vc->channel_floors;j<vc->audio_channels;++j,ch_floor_ptr+=blocksize/2) {
ch_res_ptr=vc->channel_residues+res_chan[j]*blocksize/2;
-
- for(i=0;i<blocksize/2;++i) {
- ch_floor_ptr[i]*=ch_res_ptr[i]; //FPMATH
- }
+ vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize/2);
}
// MDCT, overlap/add, save data for next overlapping FPMATH
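For reference, the operation vectorized by vector_fmul_add_add in the next
hunk is equivalent to the removed scalar loops:

    for(i=0; i<len; i++)
        ret[j + i*step] = saved[i] + buf[i]*win[i] + bias; /* bias = 385 (C) or 0 (SIMD) */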
@@ -1600,59 +1615,55 @@
vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
+ //FIXME process channels together, to allow faster simd vector_fmul_add_add?
if (vc->modes[mode_number].blockflag) {
// -- overlap/add
if (previous_window) {
- for(k=j, i=0;i<vc->blocksize_1/2;++i, k+=step) {
- ret[k]=saved[i]+buf[i]*lwin[i]+BIAS;
- }
+ vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, bias, vc->blocksize_1/2, step);
retlen=vc->blocksize_1/2;
} else {
- buf += (vc->blocksize_1-vc->blocksize_0)/4;
- for(k=j, i=0;i<vc->blocksize_0/2;++i, k+=step) {
- ret[k]=saved[i]+buf[i]*swin[i]+BIAS;
- }
+ int len = (vc->blocksize_1-vc->blocksize_0)/4;
+ buf += len;
+ vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, bias, vc->blocksize_0/2, step);
+ k = vc->blocksize_0/2*step + j;
buf += vc->blocksize_0/2;
- for(i=0;i<(vc->blocksize_1-vc->blocksize_0)/4;++i, k+=step) {
- ret[k]=buf[i]+BIAS;
+ if(vc->dsp.float_to_int16 != ff_float_to_int16_c) {
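+ // adding 15<<23 to the bit pattern bumps the IEEE 754 exponent by 15,
+ // i.e. multiplies by 2^15 (exact for normal floats; 0.0 maps to a
+ // negligibly tiny value)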
+ for(i=0; i<len; i++, k+=step)
+ ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + (15<<23); // ret[k]=buf[i]*(1<<15)
+ } else {
+ for(i=0; i<len; i++, k+=step)
+ ret[k] = buf[i] + bias;
}
buf=vc->buf;
- retlen=vc->blocksize_0/2+(vc->blocksize_1-vc->blocksize_0)/4;
+ retlen=vc->blocksize_0/2+len;
}
// -- save
if (next_window) {
buf += vc->blocksize_1/2;
- lwin += vc->blocksize_1/2-1;
- for(i=0;i<vc->blocksize_1/2;++i) {
- saved[i]=buf[i]*lwin[-i];
- }
+ vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize_1/2);
saved_start=0;
} else {
saved_start=(vc->blocksize_1-vc->blocksize_0)/4;
buf += vc->blocksize_1/2;
- for(i=0;i<saved_start;++i) {
- saved[i]=buf[i];
+ if(vc->dsp.float_to_int16 != ff_float_to_int16_c) {
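+ // same exponent trick: the saved tail must match the *2^15 domain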
+ for(i=0; i<saved_start; i++)
+ ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + (15<<23);
+ } else {
+ for(i=0; i<saved_start; i++)
+ saved[i] = buf[i];
}
- swin += vc->blocksize_0/2-1;
- for(i=0;i<vc->blocksize_0/2;++i) {
- saved[saved_start+i]=buf[saved_start+i]*swin[-i];
- }
+ vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize_0/2);
}
} else {
// --overlap/add
for(k=j, i=0;i<saved_start;++i, k+=step) {
- ret[k]=saved[i]+BIAS;
+ ret[k]=saved[i]+bias;
}
- for(i=0;i<vc->blocksize_0/2;++i, k+=step) {
- ret[k]=saved[saved_start+i]+buf[i]*swin[i]+BIAS;
- }
+ vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, bias, vc->blocksize_0/2, step);
retlen=saved_start+vc->blocksize_0/2;
// -- save
buf += vc->blocksize_0/2;
- swin += vc->blocksize_0/2-1;
- for(i=0;i<vc->blocksize_0/2;++i) {
- saved[i]=buf[i]*swin[-i];
- }
+ vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize_0/2);
saved_start=0;
}
}
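The (15<<23) integer add above is the standard exponent trick; a standalone
check (demo only, not part of the patch):

#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void){
    float x = 0.3125f, y;
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    bits += 15 << 23;               /* exponent field starts at bit 23 */
    memcpy(&y, &bits, sizeof y);
    assert(y == x * 32768.0f);      /* holds for normal floats, not 0.0/denormals */
    return 0;
}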
@@ -1695,16 +1706,7 @@
AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
- for(i=0;i<len;++i) {
- int_fast32_t tmp= ((int32_t*)vc->ret)[i];
- if(tmp & 0xf0000){
-// tmp= (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
- if(tmp > 0x43c0ffff) tmp= 0xFFFF;
- else tmp= 0;
- }
- ((int16_t*)data)[i]=tmp - 0x8000;
- }
-
+ vc->dsp.float_to_int16(data, vc->ret, len);
*data_size=len*2;
return buf_size ;
Index: dsputil.c
===================================================================
--- dsputil.c (revision 5954)
+++ dsputil.c (working copy)
@@ -3753,6 +3753,38 @@
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
+static void vector_fmul_c(float *dst, const float *src, int len){
+ int i;
+ for(i=0; i<len; i++)
+ dst[i] *= src[i];
+}
+
+static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
+ int i;
+ src1 += len-1;
+ for(i=0; i<len; i++)
+ dst[i] = src0[i] * src1[-i];
+}
+
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
+ int i;
+ for(i=0; i<len; i++)
+ dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
+}
+
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
+ int i;
+ for(i=0; i<len; i++) {
+ int_fast32_t tmp = ((int32_t*)src)[i];
+ if(tmp & 0xf0000){
+// tmp = (0x43c0ffff - tmp)>>31; //ask gcc devs why this is slower
+ if(tmp > 0x43c0ffff) tmp = 0xFFFF;
+ else tmp = 0;
+ }
+ dst[i] = tmp - 0x8000;
+ }
+}
+
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
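A quick sanity check of the new primitives' semantics (demo only; assumes
it is compiled in the same file as the static helpers above):

#include <assert.h>

static void test_float_primitives(void){
    float a[8] = {1,2,3,4,5,6,7,8};
    float b[8] = {8,7,6,5,4,3,2,1};
    float d[16];
    vector_fmul_reverse_c(d, a, b, 8);              /* d[i] = a[i]*b[7-i] */
    assert(d[0] == 1 && d[7] == 64);
    ff_vector_fmul_add_add_c(d, a, b, a, 10, 8, 2); /* d[2*i] = a[i]*b[i]+a[i]+10 */
    assert(d[0] == 19 && d[14] == 26);
}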
@@ -4096,6 +4128,10 @@
#ifdef CONFIG_VORBIS_DECODER
c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
+ c->vector_fmul = vector_fmul_c;
+ c->vector_fmul_reverse = vector_fmul_reverse_c;
+ c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
+ c->float_to_int16 = ff_float_to_int16_c;
c->shrink[0]= ff_img_copy_plane;
c->shrink[1]= ff_shrink22;
Index: dsputil.h
===================================================================
--- dsputil.h (revision 5954)
+++ dsputil.h (working copy)
@@ -58,6 +58,10 @@
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
+void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int blocksize, int step);
+void ff_float_to_int16_c(int16_t *dst, const float *src, int len);
+
/* encoding scans */
extern const uint8_t ff_alternate_horizontal_scan[64];
extern const uint8_t ff_alternate_vertical_scan[64];
@@ -307,8 +311,18 @@
void (*h261_loop_filter)(uint8_t *src, int stride);
+ /* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+ void (*vector_fmul)(float *dst, const float *src, int len);
+ /* assume len is a multiple of 8, and arrays are 16-byte aligned */
+ void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
+ /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
+ void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
+ /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
+ * asm versions: convert floats from [-32768.0,32767.0] without rescaling */
+ void (*float_to_int16)(int16_t *dst, const float *src, int len);
+
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
void (*fdct248)(DCTELEM *block/* align 16*/);
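For reviewers, the calling contract in one place (fragment mirroring the
vorbis.c changes above; dsp is a DSPContext filled by dsputil_init(); demo
only, not part of the patch):

    int bias = (dsp.float_to_int16 == ff_float_to_int16_c) ? 385 : 0;
    /* bias != 0: keep samples near 385.0 for the bit-twiddling C path;
     * bias == 0: keep samples in [-32768.0,32767.0] (pre-scaled windows)
     * and let the SIMD version convert directly */
    dsp.float_to_int16(pcm, floats, len);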
Index: i386/dsputil_mmx.c
===================================================================
--- i386/dsputil_mmx.c (revision 5965)
+++ i386/dsputil_mmx.c (working copy)
@@ -2772,6 +2772,199 @@
}
}
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+ long i;
+ len >>= 1;
+ for(i=0; i<len; i++) {
+ asm volatile(
+ "movq %0, %%mm0 \n\t"
+ "pfmul %1, %%mm0 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"+m"(dst[i*2])
+ :"m"(src[i*2])
+ :"memory"
+ );
+ }
+ //FIXME move emms into caller?
+ asm volatile("femms");
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+ long i;
+ len >>= 2;
+ for(i=0; i<len; i++) {
+ asm volatile(
+ "movaps %0, %%xmm0 \n\t"
+ "mulps %1, %%xmm0 \n\t"
+ "movaps %%xmm0, %0 \n\t"
+ :"+m"(dst[i*4])
+ :"m"(src[i*4])
+ :"memory"
+ );
+ }
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+ long i = len*4-16;
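+ // pswapd (3DNow!ext) swaps the two packed floats in an MMX register,
+ // giving the element reversal needed for src1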
+ asm volatile(
+ "1: \n\t"
+ "pswapd 8(%1), %%mm0 \n\t"
+ "pswapd (%1), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "movq %%mm0, (%2,%0) \n\t"
+ "movq %%mm1, 8(%2,%0) \n\t"
+ "add $16, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+ asm volatile("femms");
+}
+static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+ long i = len*4-32;
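+ // pshufd $0x1b selects dwords 3,2,1,0, reversing the four floats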
+ asm volatile(
+ "1: \n\t"
+ "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
+ "pshufd $0x1b, (%1), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%2,%0) \n\t"
+ "movaps %%xmm1, 16(%2,%0) \n\t"
+ "add $32, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ long i;
+ if(step == 2 && src3 == 0){
+ i = (len-4)*4;
+ dst += (len-4)*2;
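+ // step==2 (interleaved output): results are scattered with 32-bit
+ // movd stores, two floats (8 bytes) apart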
+ asm volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t"
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movd %%mm0, (%1) \n\t"
+ "movd %%mm1, 16(%1) \n\t"
+ "psrlq $32, %%mm0 \n\t"
+ "psrlq $32, %%mm1 \n\t"
+ "movd %%mm0, 8(%1) \n\t"
+ "movd %%mm1, 24(%1) \n\t"
+ "sub $32, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ for(i=0; i<len; i+=2){
+ asm volatile(
+ "movq %1, %%mm0 \n\t"
+ "pfmul %2, %%mm0 \n\t"
+ "pfadd %3, %%mm0 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"=m"(dst[i])
+ :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+ );
+ }
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+ asm volatile("femms");
+}
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ long i;
+ if(step == 2 && src3 == 0){
+ i = (len-8)*4;
+ dst += (len-8)*2;
+ asm volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t"
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movd %%xmm0, (%1) \n\t"
+ "movd %%xmm1, 32(%1) \n\t"
+ "psrldq $4, %%xmm0 \n\t"
+ "psrldq $4, %%xmm1 \n\t"
+ "movd %%xmm0, 8(%1) \n\t"
+ "movd %%xmm1, 40(%1) \n\t"
+ "psrldq $4, %%xmm0 \n\t"
+ "psrldq $4, %%xmm1 \n\t"
+ "movd %%xmm0, 16(%1) \n\t"
+ "movd %%xmm1, 48(%1) \n\t"
+ "psrldq $4, %%xmm0 \n\t"
+ "psrldq $4, %%xmm1 \n\t"
+ "movd %%xmm0, 24(%1) \n\t"
+ "movd %%xmm1, 56(%1) \n\t"
+ "sub $64, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ for(i=0; i<len; i+=4){
+ asm volatile(
+ "movaps %1, %%xmm0 \n\t"
+ "mulps %2, %%xmm0 \n\t"
+ "addps %3, %%xmm0 \n\t"
+ "movaps %%xmm0, %0 \n\t"
+ :"=m"(dst[i])
+ :"m"(src0[i]), "m"(src1[i]), "m"(src2[i])
+ );
+ }
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+ // not bit-exact: pf2id uses different rounding than C and SSE
+ int i;
+ for(i=0; i<len; i+=4) {
+ asm volatile(
+ "pf2id %1, %%mm0 \n\t"
+ "pf2id %2, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"=m"(dst[i])
+ :"m"(src[i]), "m"(src[i+2])
+ );
+ }
+ asm volatile("femms");
+}
+static void float_to_int16_sse(int16_t *dst, const float *src, int len){
+ int i;
+ for(i=0; i<len; i+=4) {
+ asm volatile(
+ "cvtps2pi %1, %%mm0 \n\t"
+ "cvtps2pi %2, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"=m"(dst[i])
+ :"m"(src[i]), "m"(src[i+2])
+ );
+ }
+ asm volatile("emms");
+}
+
#ifdef CONFIG_SNOW_ENCODER
extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3199,10 +3392,24 @@
}
#endif
- if(mm_flags & MM_SSE)
+ if(mm_flags & MM_3DNOW){
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+ c->vector_fmul = vector_fmul_3dnow;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+ c->float_to_int16 = float_to_int16_3dnow;
+ }
+ if(mm_flags & MM_3DNOWEXT)
+ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+ if(mm_flags & MM_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
- else if(mm_flags & MM_3DNOW)
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+ c->vector_fmul = vector_fmul_sse;
+ c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->float_to_int16 = float_to_int16_sse;
+ }
+ if(mm_flags & MM_SSE2)
+ c->vector_fmul_reverse = vector_fmul_reverse_sse2;
+ if(mm_flags & MM_3DNOW)
+ c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
}
#ifdef CONFIG_ENCODERS