[FFmpeg-cvslog] rv40: NEON optimised weak loop filter
Mans Rullgard
git at videolan.org
Sat Dec 17 02:23:02 CET 2011
ffmpeg | branch: master | Mans Rullgard <mans at mansr.com> | Fri Dec 9 21:21:26 2011 +0000| [11b1db27593a1f23a05e033f68b98a4342f1bd91] | committer: Mans Rullgard
rv40: NEON optimised weak loop filter
Signed-off-by: Mans Rullgard <mans at mansr.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=11b1db27593a1f23a05e033f68b98a4342f1bd91
---
libavcodec/arm/rv40dsp_init_neon.c | 9 +++
libavcodec/arm/rv40dsp_neon.S | 110 ++++++++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 0 deletions(-)
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
index 59dddb6..898b841 100644
--- a/libavcodec/arm/rv40dsp_init_neon.c
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -61,6 +61,13 @@ int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride,
int beta, int beta2, int edge,
int *p1, int *q1);
+void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, int stride, int filter_p1,
+ int filter_q1, int alpha, int beta,
+ int lim_p0q0, int lim_q1, int lim_p1);
+void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, int stride, int filter_p1,
+ int filter_q1, int alpha, int beta,
+ int lim_p0q0, int lim_q1, int lim_p1);
+
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
@@ -126,4 +133,6 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
+ c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon;
+ c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon;
}
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index d9e1b7c..f68f382 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -808,3 +808,113 @@ function ff_rv40_v_loop_filter_strength_neon, export=1
vmov.u16 r0, d0[0]
bx lr
endfunc
+@ Core of the weak loop filter, shared by the h and v entry points below: filters four pixel positions at once. Reads r2, r3 and the stack arguments, so sp must still be as at function entry.
+.macro rv40_weak_loop_filter
+ vdup.16 d30, r2 @ filter_p1
+ vdup.16 d31, r3 @ filter_q1
+ ldrd r2, r3, [sp] @ r2 = alpha, r3 = beta
+ vdup.16 d28, r2 @ alpha
+ vdup.16 d29, r3 @ beta
+ ldr r12, [sp, #8] @ r12 = lim_p0q0
+ vdup.16 d25, r12 @ lim_p0q0
+ ldrd r2, r3, [sp, #12] @ r2 = lim_q1, r3 = lim_p1, broadcast further down
+ vsubl.u8 q9, d5, d4 @ x, t: on entry d4 = {src[-3], src[-1]}, d5 = {src[2], src[0]}, so d19 = t = src[0] - src[-1], d18 (x) is overwritten before use
+ vabdl.u8 q8, d5, d4 @ x, abs(t): d17 = abs(t), d16 (x) is overwritten before use
+ vneg.s16 q15, q15 @ negate filter_p1/filter_q1, NOTE(review): presumably 0/1 flags turned into 0/0xffff lane masks, confirm with callers
+ vceq.i16 d16, d19, #0 @ !t
+ vshl.s16 d19, d19, #2 @ t << 2
+ vmul.u16 d18, d17, d28 @ alpha * abs(t)
+ vand d24, d30, d31 @ filter_p1 & filter_q1 (on the negated values)
+ vsubl.u8 q1, d0, d4 @ p1p2, p1p0: d2 = src[-2] - src[-3], d3 = src[-2] - src[-1] (d0 holds src[-2] in both halves)
+ vsubl.u8 q3, d1, d5 @ q1q2, q1q0: d6 = src[1] - src[2], d7 = src[1] - src[0] (d1 holds src[1] in both halves)
+ vmov.i16 d22, #3 @ constant 3 for the threshold below
+ vshr.u16 d18, d18, #7 @ u = (alpha * abs(t)) >> 7
+ vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1), done by adding the negated flags
+ vsubl.u8 q10, d0, d1 @ src[-2] - src[1]
+ vcle.u16 d18, d18, d22 @ u <= 3 - (fp1 & fq1)
+ vand d20, d20, d24 @ keep src[-2] - src[1] only where both flags are set
+ vneg.s16 d23, d25 @ -lim_p0q0
+ vadd.s16 d19, d19, d20 @ (t << 2) + masked (src[-2] - src[1])
+ vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1)
+ vtrn.32 d4, d5 @ -3, 2, -1, 0: d4 = {src[-3], src[2]}, d5 = {src[-1], src[0]}
+ vrshr.s16 d19, d19, #3 @ rounding shift right by 3
+ vmov d28, d29 @ beta: q14 now holds beta in every lane
+ vswp d3, d6 @ q1q2, p1p0: q1 = {p1p2, q1q2}, q3 = {p1p0, q1q0}
+ vmin.s16 d19, d19, d25 @ clamp to at most lim_p0q0
+ vand d30, d30, d16 @ filter_p1 mask, only where the p0/q0 filter is active
+ vand d31, d31, d16 @ filter_q1 mask, only where the p0/q0 filter is active
+ vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0
+ vmax.s16 d19, d19, d23 @ diff: clamp to at least -lim_p0q0
+ vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2)
+ vand d18, d19, d16 @ diff, zeroed where the p0/q0 filter is inactive
+ vcle.u16 q1, q1, q14 @ abs(p1p2) <= beta, abs(q1q2) <= beta
+ vneg.s16 d19, d18 @ -diff
+ vdup.16 d26, r3 @ lim_p1
+ vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff
+ vhsub.s16 q11, q10, q9 @ (p1p2 + p1p0 - diff) >> 1, (q1q2 + q1q0 + diff) >> 1
+ vand q1, q1, q15 @ combine the beta tests with the filter_p1/filter_q1 masks
+ vqmovun.s16 d4, q2 @ -1, 0: d4 = new {src[-1], src[0]}, saturated to 0..255
+ vand q9, q11, q1 @ zero the corrections where they do not apply
+ vdup.16 d27, r2 @ lim_q1
+ vneg.s16 q9, q9 @ negate, the corrections are subtracted from src[-2] and src[1]
+ vneg.s16 q14, q13 @ -lim_p1, -lim_q1
+ vmin.s16 q9, q9, q13 @ clamp to at most lim_p1, lim_q1
+ vtrn.32 d0, d1 @ -2, 1, -2, 1
+ vmax.s16 q9, q9, q14 @ clamp to at least -lim_p1, -lim_q1
+ vaddw.u8 q3, q9, d0 @ src[-2] + correction, src[1] + correction
+ vqmovun.s16 d5, q3 @ -2, 1: d5 = new {src[-2], src[1]}, saturated to 0..255
+.endm
+@ Weak loop filter across pixels one stride apart. r0 = src, r1 = stride, r2 = filter_p1, r3 = filter_q1, remaining arguments on the stack. Reads 4 bytes from each of rows -3..2 and rewrites rows -2..1.
+function ff_rv40_h_weak_loop_filter_neon, export=1
+ sub r0, r0, r1, lsl #1 @ r0 = src - 2 * stride
+ sub r0, r0, r1 @ r0 = src - 3 * stride
+
+ vld1.32 {d4[]}, [r0,:32], r1 @ row -3 into both halves of d4 (:32 asserts a 4-byte aligned address)
+ vld1.32 {d0[]}, [r0,:32], r1 @ row -2 into both halves of d0
+ vld1.32 {d4[1]}, [r0,:32], r1 @ row -1 into the upper half of d4
+ vld1.32 {d5[]}, [r0,:32], r1 @ row 0 into both halves of d5
+ vld1.32 {d1[]}, [r0,:32], r1 @ row 1 into both halves of d1
+ vld1.32 {d5[0]}, [r0,:32] @ row 2 into the lower half of d5, no post-increment
+
+ sub r0, r0, r1, lsl #2 @ r0 = src - 2 * stride, the first row written back
+
+ rv40_weak_loop_filter
+
+ vst1.32 {d5[0]}, [r0,:32], r1 @ row -2
+ vst1.32 {d4[0]}, [r0,:32], r1 @ row -1
+ vst1.32 {d4[1]}, [r0,:32], r1 @ row 0
+ vst1.32 {d5[1]}, [r0,:32], r1 @ row 1
+
+ bx lr
+endfunc
+@ Weak loop filter across horizontally adjacent pixels. r0 = src, r1 = stride, r2/r3 and the stack arguments are consumed by rv40_weak_loop_filter. Loads 8 bytes from src - 3 on each of 4 rows and rewrites the 4 bytes at src - 2 on each row.
+function ff_rv40_v_weak_loop_filter_neon, export=1
+ sub r12, r0, #3 @ r12 = load pointer, 3 bytes before src
+ sub r0, r0, #2 @ r0 = store pointer, 2 bytes before src
+
+ vld1.8 {d4}, [r12], r1 @ row 0, columns -3..4
+ vld1.8 {d5}, [r12], r1 @ row 1, columns -3..4
+ vld1.8 {d2}, [r12], r1 @ row 2, columns -3..4
+ vld1.8 {d3}, [r12], r1 @ row 3, columns -3..4
+
+ vtrn.16 q2, q1 @ first step of transposing rows into columns
+ vtrn.8 d4, d5 @ d4 = {column -3, column 1}, d5 = {column -2, column 2}
+ vtrn.8 d2, d3 @ d2 = {column -1, column 3}, d3 = {column 0, column 4}
+
+ vrev64.32 d5, d5 @ d5 = {column 2, column -2}
+ vtrn.32 q2, q1 @ d4 = {-3, -1}, d5 = {2, 0}, d2 = {1, 3}, d3 = {-2, 4}
+ vdup.32 d0, d3[0] @ d0 = column -2 in both halves
+ vdup.32 d1, d2[0] @ d1 = column 1 in both halves
+
+ rv40_weak_loop_filter
+
+ vtrn.32 q2, q3 @ move columns 0 and 1 into the low halves of d6 and d7
+ vswp d4, d5 @ low halves: d4 = column -2, d5 = column -1
+
+ vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1 @ row 0: columns -2, -1, 0, 1
+ vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1 @ row 1: columns -2, -1, 0, 1
+ vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1 @ row 2: columns -2, -1, 0, 1
+ vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1 @ row 3: columns -2, -1, 0, 1
+
+ bx lr
+endfunc
More information about the ffmpeg-cvslog
mailing list