[FFmpeg-devel] [PATCH] avfilter/scene_sad: add AArch64 SIMD
quinkblack at foxmail.com
quinkblack at foxmail.com
Sat Feb 1 11:57:17 EET 2020
From: Zhao Zhili <quinkblack at foxmail.com>
For 8 bit depth:
./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -
Test results on Snapdragon 845:
Before:
frame= 250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.924x
bench: utime=8.360s stime=2.350s rtime=10.820s
After:
frame= 250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=2.04x
bench: utime=2.650s stime=2.210s rtime=4.909s
Test results on HiSilicon Kirin 970:
Before:
frame= 250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.239x
bench: utime=35.156s stime=6.604s rtime=41.820s
After:
frame= 250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.403x
bench: utime=18.400s stime=6.376s rtime=24.798s
For 16 bit depth:
./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -
Test results on Snapdragon 845:
Before:
frame= 250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.756x
bench: utime=8.700s stime=4.410s rtime=13.226s
After:
frame= 250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=1.07x
bench: utime=4.920s stime=4.350s rtime=9.356s
Test results on HiSilicon Kirin 970:
Before:
frame= 250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.161x
bench: utime=48.868s stime=13.124s rtime=62.110s
After:
frame= 250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A speed=0.205x
bench: utime=35.600s stime=13.036s rtime=48.708s
---
libavfilter/aarch64/Makefile | 2 +
libavfilter/aarch64/scene_sad_init.c | 37 +++++++
libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
libavfilter/scene_sad.c | 2 +
libavfilter/scene_sad.h | 2 +
5 files changed, 192 insertions(+)
create mode 100644 libavfilter/aarch64/scene_sad_init.c
create mode 100644 libavfilter/aarch64/scene_sad_neon.S
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index 6c727f9859..3a458f511f 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_init.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_init.o
+OBJS-$(CONFIG_SCENE_SAD) += aarch64/scene_sad_init.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_neon.o
+NEON-OBJS-$(CONFIG_SCENE_SAD) += aarch64/scene_sad_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
new file mode 100644
index 0000000000..8de769ac10
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_init.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/scene_sad.h"
+
+void ff_scene_sad_neon(SCENE_SAD_PARAMS); /* 8-bit kernel, defined in scene_sad_neon.S */
+
+void ff_scene_sad16_neon(SCENE_SAD_PARAMS); /* 16-bit kernel, defined in scene_sad_neon.S */
+
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
+{
+    /* Without NEON there is no accelerated kernel to hand out. */
+    if (!have_neon(av_get_cpu_flags()))
+        return NULL;
+
+    /* Kernels exist for 8 and 16 bits only; any other depth gets NULL. */
+    if (depth == 8)
+        return ff_scene_sad_neon;
+
+    return depth == 16 ? ff_scene_sad16_neon : NULL;
+}
diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
new file mode 100644
index 0000000000..5b3b027a53
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_neon.S
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Zhao Zhili
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
+// const uint8_t *src2, ptrdiff_t stride2,
+// ptrdiff_t width, ptrdiff_t height,
+// uint64_t *sum)
+.macro scene_sad_neon, depth=8
+ // Store sum(|src1[x] - src2[x]|) over width x height samples to *sum.
+ // x0: src1
+ // x1: stride1
+ // x2: src2
+ // x3: stride2
+ // x4: width
+ // x5: height
+ // x6: sum
+
+ // x7: step of width loop
+ // x8: index of row
+ // x9: width / x7 * x7
+ // x10: sad
+ // x11: index of column
+ // w12: src1[x]
+ // w13: src2[x]
+ // x14, x15: address of the current 64-byte chunk in src1, src2
+ mov x8, xzr
+ mov x10, xzr
+ cmp x4, #1 // the loops below test their exit after the body,
+ b.lt 5f // so an empty block must leave now with sum = 0
+ cmp x5, #1
+ b.lt 5f
+
+.if \depth == 8
+ mov x7, #64
+ and x9, x4, #0xFFFFFFFFFFFFFFC0
+.endif
+.if \depth == 16
+ mov x7, #32
+ and x9, x4, #0xFFFFFFFFFFFFFFE0
+.endif
+
+1: cmp x4, x7 // check width
+ mov x11, xzr
+ b.lt 3f // row narrower than one vector step: scalar only
+
+ mov v0.d[0], x10
+
+ // vector loop
+2:
+.if \depth == 8
+ add x14, x0, x11
+ add x15, x2, x11
+.endif
+.if \depth == 16
+ add x14, x0, x11, lsl #1
+ add x15, x2, x11, lsl #1
+.endif
+ ld1 {v16.4S, v17.4S, v18.4S, v19.4S}, [x14]
+ ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x15]
+ add x11, x11, x7
+ cmp x9, x11 // flags are consumed by "b.ne 2b" below
+
+.if \depth == 8
+ uabd v16.16B, v16.16B, v20.16B
+ uabd v17.16B, v17.16B, v21.16B
+ uabd v18.16B, v18.16B, v22.16B
+ uabd v19.16B, v19.16B, v23.16B
+ uaddlv h16, v16.16B
+ uaddlv h17, v17.16B
+ uaddlv h18, v18.16B
+ uaddlv h19, v19.16B
+.endif
+.if \depth == 16
+ uabd v16.8H, v16.8H, v20.8H
+ uabd v17.8H, v17.8H, v21.8H
+ uabd v18.8H, v18.8H, v22.8H
+ uabd v19.8H, v19.8H, v23.8H
+ uaddlv s16, v16.8H
+ uaddlv s17, v17.8H
+ uaddlv s18, v18.8H
+ uaddlv s19, v19.8H
+.endif
+
+ add d16, d16, d17
+ add d18, d18, d19
+ add d0, d0, d16
+ add d0, d0, d18
+ b.ne 2b // loop until x11 reaches x9
+
+ cmp x9, x4
+ fmov x10, d0
+ b.eq 4f // no leftover columns in this row
+
+ // scalar loop
+3:
+.if \depth == 8
+ ldrb w12, [x0, x11]
+ ldrb w13, [x2, x11]
+.endif
+.if \depth == 16
+ ldrh w12, [x0, x11, lsl #1]
+ ldrh w13, [x2, x11, lsl #1]
+.endif
+ add x11, x11, #1
+ subs w12, w12, w13
+ cneg w12, w12, mi // absolute value of the difference
+ add x10, x10, x12
+ cmp x11, x4
+ b.ne 3b
+
+ // next row
+4:
+ add x8, x8, #1 // =1
+ add x0, x0, x1
+ cmp x8, x5
+ add x2, x2, x3
+ b.ne 1b
+
+5:
+ str x10, [x6]
+ ret
+.endm
+
+function ff_scene_sad_neon, export=1
+ scene_sad_neon depth=8 // 8-bit samples: byte loads, 64 samples per vector step
+endfunc
+
+function ff_scene_sad16_neon, export=1
+ scene_sad_neon depth=16 // 16-bit samples: halfword loads, 32 samples per vector step
+endfunc
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index 73d3eacbfa..ee0c71f659 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
ff_scene_sad_fn sad = NULL;
if (ARCH_X86)
sad = ff_scene_sad_get_fn_x86(depth);
+ if (ARCH_AARCH64)
+ sad = ff_scene_sad_get_fn_aarch64(depth);
if (!sad) {
if (depth == 8)
sad = ff_scene_sad_c;
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..c868200dc4 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
void ff_scene_sad16_c(SCENE_SAD_PARAMS);
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth); /* NULL when no NEON kernel applies */
+
ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
--
2.22.0
More information about the ffmpeg-devel
mailing list