[FFmpeg-cvslog] vp3: integrate clear_blocks with idct of previous block.
Ronald S. Bultje
git at videolan.org
Sun Jan 20 14:02:00 CET 2013
ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Fri Jan 18 16:43:04 2013 +0100| [aeaf268e52fc11c1f64914a319e0edddf1346d6a] | committer: Ronald S. Bultje
vp3: integrate clear_blocks with idct of previous block.
This is identical to what e.g. vp8 does, and avoids the function call
overhead (and the dependency on dsputil for this particular function).
ARM asm updated by Janne Grunau <janne-libav at jannau.net>.
Signed-off-by: Janne Grunau <janne-libav at jannau.net>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aeaf268e52fc11c1f64914a319e0edddf1346d6a
---
libavcodec/arm/vp3dsp_neon.S | 22 +++++++++++++++-------
libavcodec/ppc/vp3dsp_altivec.c | 2 ++
libavcodec/vp3.c | 5 ++---
libavcodec/vp3dsp.c | 5 ++++-
libavcodec/vp3dsp.h | 2 +-
libavcodec/x86/vp3dsp.asm | 27 ++++++++++++++++++++-------
libavcodec/x86/vp3dsp_init.c | 2 +-
7 files changed, 45 insertions(+), 20 deletions(-)
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index e09de57..e5ecfc3 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -108,14 +108,20 @@ endfunc
function vp3_idct_start_neon
vpush {d8-d15}
+ vmov.i16 q4, #0
+ vmov.i16 q5, #0
movrel r3, vp3_idct_constants
vld1.64 {d0-d1}, [r3,:128]
- vld1.64 {d16-d19}, [r2,:128]!
- vld1.64 {d20-d23}, [r2,:128]!
- vld1.64 {d24-d27}, [r2,:128]!
+ vld1.64 {d16-d19}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d20-d23}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d24-d27}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
vadd.s16 q1, q8, q12
vsub.s16 q8, q8, q12
- vld1.64 {d28-d31}, [r2,:128]!
+ vld1.64 {d28-d31}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
vp3_idct_core_neon:
vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
@@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1
endfunc
function ff_vp3_idct_dc_add_neon, export=1
- ldrsh r2, [r2]
+ ldrsh r12, [r2]
mov r3, r0
- add r2, r2, #15
- vdup.16 q15, r2
+ add r12, r12, #15
+ vdup.16 q15, r12
+ mov r12, 0
+ strh r12, [r2]
vshr.s16 q15, q15, #5
vld1.8 {d0}, [r0,:64], r1
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index 75a3677..6adf9ae 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
PUT(b5) dst += stride;
PUT(b6) dst += stride;
PUT(b7)
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
@@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
ADD(b5) dst += stride;
ADD(b6) dst += stride;
ADD(b7)
+ memset(block, 0, sizeof(*block) * 64);
}
#endif /* HAVE_ALTIVEC */
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 0340c22..9417535 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext {
DSPContext dsp;
VideoDSPContext vdsp;
VP3DSPContext vp3dsp;
+ DECLARE_ALIGNED(16, DCTELEM, block)[64];
int flipped_image;
int last_slice_end;
int skip_loop_filter;
@@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int
static void render_slice(Vp3DecodeContext *s, int slice)
{
int x, y, i, j, fragment;
- LOCAL_ALIGNED_16(DCTELEM, block, [64]);
+ DCTELEM *block = s->block;
int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
int motion_halfpel_index;
uint8_t *motion_source;
@@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
}
}
- s->dsp.clear_block(block);
-
/* invert DCT and place (or add) in final output */
if (s->all_fragments[i].coding_method == MODE_INTRA) {
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 9b0b5d0..9e6209d 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 1);
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
idct(dest, line_size, block, 2);
+ memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
- const DCTELEM *block/*align 16*/){
+ DCTELEM *block/*align 16*/){
int i, dc = (block[0] + 15) >> 5;
for(i = 0; i < 8; i++){
@@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
dest[7] = av_clip_uint8(dest[7] + dc);
dest += line_size;
}
+ block[0] = 0;
}
static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride,
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 3781bbf..feb3000 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -25,7 +25,7 @@
typedef struct VP3DSPContext {
void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block);
- void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block);
+ void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block);
void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc1e776..d2c464c 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -562,6 +562,13 @@ cglobal vp3_idct_put, 3, 4, 9
%endif
%assign %%i %%i+64
%endrep
+
+ pxor m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+ mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
+%endrep
RET
cglobal vp3_idct_add, 3, 4, 9
@@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9
movhps [r0+r1], m0
%endif
lea r0, [r0+r1*2]
+%assign %%offset 0
+%rep 32/mmsize
+ mova [r2+%%offset], m4
+%assign %%offset %%offset+mmsize
+%endrep
add r2, 32
dec r3
jg .loop
@@ -620,7 +632,7 @@ vp3_idct_funcs
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
- movq m5, [r0+r3 ]
+ movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
@@ -630,7 +642,7 @@ vp3_idct_funcs
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
- movq [r0+r3 ], m5
+ movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
@@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
- lea r3, [r1*3]
- movsx r2, word [r2]
- add r2, 15
- sar r2, 5
- movd m0, r2d
+ movsx r3, word [r2]
+ mov word [r2], 0
+ lea r2, [r1*3]
+ add r3, 15
+ sar r3, 5
+ movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index bbe74ba..95beeab 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
- const DCTELEM *block);
+ DCTELEM *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
More information about the ffmpeg-cvslog
mailing list