[FFmpeg-cvslog] h264/aarch64: optimize neon loop filter
Janne Grunau
git at videolan.org
Wed Feb 20 20:46:50 EET 2019
ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Tue Jan 1 22:37:11 2019 +0100| [846c3d6aca5484904e60946c4fe8b8833bc07f92] | committer: Janne Grunau
h264/aarch64: optimize neon loop filter
Exit as soon as possible if no filtering will be done, i.e. when the combined alpha/beta comparison mask is zero for every pixel.
Improves the checkasm --bench cycle counts (before -> after) on a Snapdragon 820e:
h264_h_loop_filter_luma_8bpp_c: 72.4 -> 72.5
h264_h_loop_filter_luma_8bpp_neon: 97.1 -> 56.3
h264_v_loop_filter_luma_8bpp_c: 174.0 -> 173.5
h264_v_loop_filter_luma_8bpp_neon: 62.9 -> 60.9
h264_h_loop_filter_chroma_8bpp_c: 30.2 -> 30.3
h264_h_loop_filter_chroma_8bpp_neon: 51.6 -> 25.7
h264_v_loop_filter_chroma_8bpp_c: 57.3 -> 57.3
h264_v_loop_filter_chroma_8bpp_neon: 28.0 -> 24.0
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=846c3d6aca5484904e60946c4fe8b8833bc07f92
---
libavcodec/aarch64/h264dsp_neon.S | 33 +++++++++++++++++++--------------
1 file changed, 19 insertions(+), 14 deletions(-)
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 60ffa24500..b649f1d018 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -54,9 +54,12 @@
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
+ and v21.16B, v21.16B, v30.16B // < beta
+ shrn v30.8b, v21.8h, #4
+ mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta
- and v21.16B, v21.16B, v30.16B
cmhi v19.16B, v22.16B, v19.16B // < beta
+ cbz x7, 9f
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B
@@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0]
-
+9:
ret
endfunc
@@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1
st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1
-
+9:
ret
endfunc
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
+ dup v23.8B, w3 // beta
uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
- uxtl v4.8H, v0.8B
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
+ uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ cmhi v26.8B, v22.8B, v26.8B // < alpha
+ cmhi v28.8B, v23.8B, v28.8B // < beta
+ cmhi v30.8B, v23.8B, v30.8B // < beta
+ uxtl v4.8H, v0.8B
+ and v26.8B, v26.8B, v28.8B
usubw v4.8H, v4.8H, v16.8B
- sli v24.8H, v24.8H, #8
+ and v26.8B, v26.8B, v30.8B
shl v4.8H, v4.8H, #2
- uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ mov x2, v26.d[0]
+ sli v24.8H, v24.8H, #8
uaddw v4.8H, v4.8H, v18.8B
- cmhi v26.8B, v22.8B, v26.8B // < alpha
+ cbz x2, 9f
usubw v4.8H, v4.8H, v2.8B
- dup v22.8B, w3 // beta
rshrn v4.8B, v4.8H, #3
- cmhi v28.8B, v22.8B, v28.8B // < beta
- cmhi v30.8B, v22.8B, v30.8B // < beta
smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B
- and v26.8B, v26.8B, v28.8B
smax v4.8B, v4.8B, v25.8B
- and v26.8B, v26.8B, v30.8B
uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B
@@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1
-
+9:
ret
endfunc
@@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1
-
+9:
ret
endfunc
More information about the ffmpeg-cvslog
mailing list