[FFmpeg-cvslog] Change DSPContext.vector_fmul() from dst=dst*src to dst=src0*src1.
Justin Ruggles
git
Sun Jan 23 21:01:55 CET 2011
ffmpeg | branch: master | Justin Ruggles <justin.ruggles at gmail.com> | Thu Jan 13 15:28:06 2011 -0500| [015f9f1ad379745fe02ba219a83c406fdeaf37be] | committer: Michael Niedermayer
Change DSPContext.vector_fmul() from dst=dst*src to dst=src0*src1.
Signed-off-by: Mans Rullgard <mans at mansr.com>
(cherry picked from commit 6eabb0d3ad42b91c1b4c298718c29961f7c1653a)
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=015f9f1ad379745fe02ba219a83c406fdeaf37be
---
libavcodec/aacenc.c | 2 +-
libavcodec/arm/dsputil_init_neon.c | 2 +-
libavcodec/arm/dsputil_init_vfp.c | 3 +-
libavcodec/arm/dsputil_neon.S | 45 +++++++++++++++++------------------
libavcodec/arm/dsputil_vfp.S | 29 +++++++++++------------
libavcodec/atrac3.c | 2 +-
libavcodec/dsputil.c | 4 +-
libavcodec/dsputil.h | 2 +-
libavcodec/nellymoserenc.c | 6 ++--
libavcodec/ppc/float_altivec.c | 10 ++++----
libavcodec/twinvq.c | 4 +-
libavcodec/vorbis_dec.c | 2 +-
libavcodec/x86/dsputil_mmx.c | 24 +++++++++---------
13 files changed, 67 insertions(+), 68 deletions(-)
diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index f8f0eb4..6a113ef 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -256,7 +256,7 @@ static void apply_window_and_mdct(AVCodecContext *avctx, AACEncContext *s,
s->output[i - 448 - k] = (i < 1024)
? sce->saved[i]
: audio[(i-1024)*chans];
- s->dsp.vector_fmul (s->output, k ? swindow : pwindow, 128);
+ s->dsp.vector_fmul (s->output, s->output, k ? swindow : pwindow, 128);
s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128);
ff_mdct_calc(&s->mdct128, sce->coeffs + k, s->output);
}
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 04ebb00..221183c 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -138,7 +138,7 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
-void ff_vector_fmul_neon(float *dst, const float *src, int len);
+void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
void ff_vector_fmul_window_neon(float *dst, const float *src0,
const float *src1, const float *win,
float add_bias, int len);
diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c
index 9f8c1b7..76ef6b4 100644
--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -21,7 +21,8 @@
#include "libavcodec/dsputil.h"
#include "dsputil_arm.h"
-void ff_vector_fmul_vfp(float *dst, const float *src, int len);
+void ff_vector_fmul_vfp(float *dst, const float *src0,
+ const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 2bcdb39..42fb38d 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -738,42 +738,41 @@ function ff_float_to_int16_interleave_neon, export=1
endfunc
function ff_vector_fmul_neon, export=1
- mov r3, r0
- subs r2, r2, #8
- vld1.64 {d0-d3}, [r0,:128]!
- vld1.64 {d4-d7}, [r1,:128]!
+ subs r3, r3, #8
+ vld1.64 {d0-d3}, [r1,:128]!
+ vld1.64 {d4-d7}, [r2,:128]!
vmul.f32 q8, q0, q2
vmul.f32 q9, q1, q3
beq 3f
- bics ip, r2, #15
+ bics ip, r3, #15
beq 2f
1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
+ vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
vmul.f32 q10, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
vmul.f32 q11, q1, q3
- vst1.64 {d16-d19},[r3,:128]!
- vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
+ vst1.64 {d16-d19},[r0,:128]!
+ vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
vmul.f32 q8, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
vmul.f32 q9, q1, q3
- vst1.64 {d20-d23},[r3,:128]!
+ vst1.64 {d20-d23},[r0,:128]!
bne 1b
- ands r2, r2, #15
+ ands r3, r3, #15
beq 3f
-2: vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
- vst1.64 {d16-d17},[r3,:128]!
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d16-d17},[r0,:128]!
vmul.f32 q8, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
- vst1.64 {d18-d19},[r3,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
+ vst1.64 {d18-d19},[r0,:128]!
vmul.f32 q9, q1, q3
-3: vst1.64 {d16-d19},[r3,:128]!
+3: vst1.64 {d16-d19},[r0,:128]!
bx lr
endfunc
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index b704ba9..a65b69e 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -41,34 +41,33 @@
* ARM VFP optimized implementation of 'vector_fmul_c' function.
* Assume that len is a positive number and is multiple of 8
*/
-@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
+@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
vpush {d8-d15}
- mov r3, r0
fmrx r12, fpscr
orr r12, r12, #(3 << 16) /* set vector size to 4 */
fmxr fpscr, r12
- vldmia r3!, {s0-s3}
- vldmia r1!, {s8-s11}
- vldmia r3!, {s4-s7}
- vldmia r1!, {s12-s15}
+ vldmia r1!, {s0-s3}
+ vldmia r2!, {s8-s11}
+ vldmia r1!, {s4-s7}
+ vldmia r2!, {s12-s15}
vmul.f32 s8, s0, s8
1:
- subs r2, r2, #16
+ subs r3, r3, #16
vmul.f32 s12, s4, s12
- vldmiage r3!, {s16-s19}
- vldmiage r1!, {s24-s27}
- vldmiage r3!, {s20-s23}
- vldmiage r1!, {s28-s31}
+ vldmiage r1!, {s16-s19}
+ vldmiage r2!, {s24-s27}
+ vldmiage r1!, {s20-s23}
+ vldmiage r2!, {s28-s31}
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
vmulge.f32 s28, s20, s28
- vldmiagt r3!, {s0-s3}
- vldmiagt r1!, {s8-s11}
- vldmiagt r3!, {s4-s7}
- vldmiagt r1!, {s12-s15}
+ vldmiagt r1!, {s0-s3}
+ vldmiagt r2!, {s8-s11}
+ vldmiagt r1!, {s4-s7}
+ vldmiagt r2!, {s12-s15}
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}
diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c
index 797e1f1..cc13b73 100644
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -159,7 +159,7 @@ static void IMLT(ATRAC3Context *q, float *pInput, float *pOutput, int odd_band)
ff_imdct_calc(&q->mdct_ctx,pOutput,pInput);
/* Perform windowing on the output. */
- dsp.vector_fmul(pOutput,mdct_window,512);
+ dsp.vector_fmul(pOutput, pOutput, mdct_window, 512);
}
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 57b2640..2ed0052 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3750,10 +3750,10 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
-static void vector_fmul_c(float *dst, const float *src, int len){
+static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
int i;
for(i=0; i<len; i++)
- dst[i] *= src[i];
+ dst[i] = src0[i] * src1[i];
}
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 842d727..baa68be 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -375,7 +375,7 @@ typedef struct DSPContext {
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
- void (*vector_fmul)(float *dst, const float *src, int len);
+ void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
/* assume len is a multiple of 8, and src arrays are 16-byte aligned */
void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index b3f6aa3..03b647e 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -113,13 +113,13 @@ static const uint8_t quant_lut_offset[8] = { 0, 0, 1, 4, 11, 32, 81, 230 };
static void apply_mdct(NellyMoserEncodeContext *s)
{
- memcpy(s->in_buff, s->buf[s->bufsel], NELLY_BUF_LEN * sizeof(float));
- s->dsp.vector_fmul(s->in_buff, ff_sine_128, NELLY_BUF_LEN);
+ s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN);
s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128,
NELLY_BUF_LEN);
ff_mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
- s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN);
+ s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN,
+ ff_sine_128, NELLY_BUF_LEN);
s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128,
NELLY_BUF_LEN);
ff_mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN);
diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c
index d1f9f1a..188e03e 100644
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -23,16 +23,16 @@
#include "dsputil_altivec.h"
#include "util_altivec.h"
-static void vector_fmul_altivec(float *dst, const float *src, int len)
+static void vector_fmul_altivec(float *dst, const float *src0, const float *src1, int len)
{
int i;
vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
for(i=0; i<len-7; i+=8) {
- d0 = vec_ld(0, dst+i);
- s = vec_ld(0, src+i);
- d1 = vec_ld(16, dst+i);
+ d0 = vec_ld(0, src0+i);
+ s = vec_ld(0, src1+i);
+ d1 = vec_ld(16, src0+i);
d0 = vec_madd(d0, s, zero);
- d1 = vec_madd(d1, vec_ld(16,src+i), zero);
+ d1 = vec_madd(d1, vec_ld(16,src1+i), zero);
vec_st(d0, 0, dst+i);
vec_st(d1, 16, dst+i);
}
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 3d26c6e..15907ae 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -783,7 +783,7 @@ static void read_and_decode_spectrum(TwinContext *tctx, GetBitContext *gb,
dec_bark_env(tctx, bark1[i][j], bark_use_hist[i][j], i,
tctx->tmp_buf, gain[sub*i+j], ftype);
- tctx->dsp.vector_fmul(chunk + block_size*j, tctx->tmp_buf,
+ tctx->dsp.vector_fmul(chunk + block_size*j, chunk + block_size*j, tctx->tmp_buf,
block_size);
}
@@ -805,7 +805,7 @@ static void read_and_decode_spectrum(TwinContext *tctx, GetBitContext *gb,
dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf);
for (j = 0; j < mtab->fmode[ftype].sub; j++) {
- tctx->dsp.vector_fmul(chunk, tctx->tmp_buf, block_size);
+ tctx->dsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
chunk += block_size;
}
}
diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c
index 4e16c4a..8f15a21 100644
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -1578,7 +1578,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc)
for (j = vc->audio_channels-1;j >= 0; j--) {
ch_floor_ptr = vc->channel_floors + j * blocksize / 2;
ch_res_ptr = vc->channel_residues + res_chan[j] * blocksize / 2;
- vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize / 2);
+ vc->dsp.vector_fmul(ch_floor_ptr, ch_floor_ptr, ch_res_ptr, blocksize / 2);
ff_imdct_half(&vc->mdct[blockflag], ch_res_ptr, ch_floor_ptr);
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 825149e..5ddfeca 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2074,38 +2074,38 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c
}
}
-static void vector_fmul_3dnow(float *dst, const float *src, int len){
+static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
x86_reg i = (len-4)*4;
__asm__ volatile(
"1: \n\t"
- "movq (%1,%0), %%mm0 \n\t"
- "movq 8(%1,%0), %%mm1 \n\t"
- "pfmul (%2,%0), %%mm0 \n\t"
- "pfmul 8(%2,%0), %%mm1 \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
"movq %%mm0, (%1,%0) \n\t"
"movq %%mm1, 8(%1,%0) \n\t"
"sub $16, %0 \n\t"
"jge 1b \n\t"
"femms \n\t"
:"+r"(i)
- :"r"(dst), "r"(src)
+ :"r"(dst), "r"(src0), "r"(src1)
:"memory"
);
}
-static void vector_fmul_sse(float *dst, const float *src, int len){
+static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
x86_reg i = (len-8)*4;
__asm__ volatile(
"1: \n\t"
- "movaps (%1,%0), %%xmm0 \n\t"
- "movaps 16(%1,%0), %%xmm1 \n\t"
- "mulps (%2,%0), %%xmm0 \n\t"
- "mulps 16(%2,%0), %%xmm1 \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
"movaps %%xmm0, (%1,%0) \n\t"
"movaps %%xmm1, 16(%1,%0) \n\t"
"sub $32, %0 \n\t"
"jge 1b \n\t"
:"+r"(i)
- :"r"(dst), "r"(src)
+ :"r"(dst), "r"(src0), "r"(src1)
:"memory"
);
}
More information about the ffmpeg-cvslog mailing list