[FFmpeg-devel] [PATCH v2 15/16] swscale/aarch64/range_convert: update neon range_convert functions to new API
Ramiro Polla
ramiro.polla at gmail.com
Fri Sep 27 15:52:40 EEST 2024
A55:
chrRangeFromJpeg8_1920_c: 28833.8 ( 1.00x)
chrRangeFromJpeg8_1920_neon: 5309.9 ( 5.43x) 5313.1 ( 5.43x)
chrRangeToJpeg8_1920_c: 23070.6 ( 1.00x)
chrRangeToJpeg8_1920_neon: 5550.8 ( 4.16x) 5550.8 ( 4.16x)
lumRangeFromJpeg8_1920_c: 15388.1 ( 1.00x)
lumRangeFromJpeg8_1920_neon: 3148.1 ( 4.89x) 3145.8 ( 4.89x)
lumRangeToJpeg8_1920_c: 19226.2 ( 1.00x)
lumRangeToJpeg8_1920_neon: 3624.9 ( 5.30x) 3624.9 ( 5.30x)
A76:
chrRangeFromJpeg8_1920_c: 6317.8 ( 1.00x)
chrRangeFromJpeg8_1920_neon: 2343.8 ( 2.70x) 2304.2 ( 2.74x)
chrRangeToJpeg8_1920_c: 9287.1 ( 1.00x)
chrRangeToJpeg8_1920_neon: 2823.8 ( 3.29x) 2793.8 ( 3.32x)
lumRangeFromJpeg8_1920_c: 4359.1 ( 1.00x)
lumRangeFromJpeg8_1920_neon: 1105.8 ( 3.94x) 1105.0 ( 3.94x)
lumRangeToJpeg8_1920_c: 5957.2 ( 1.00x)
lumRangeToJpeg8_1920_neon: 1331.0 ( 4.48x) 1328.0 ( 4.49x)
---
libswscale/aarch64/range_convert_neon.S | 59 +++++++++++++------------
libswscale/aarch64/swscale.c | 17 ++++---
2 files changed, 39 insertions(+), 37 deletions(-)
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
index 2f418adb24..1aadd8e04d 100644
--- a/libswscale/aarch64/range_convert_neon.S
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -20,12 +20,13 @@
#include "libavutil/aarch64/asm.S"
-.macro lumConvertRange name, fromto, mult, offset, shift
-function ff_\name, export=1
- mov w3, #\mult
- dup v25.4s, w3
- movz w3, #(\offset & 0xffff)
- movk w3, #((\offset >> 16) & 0xffff), lsl #16
+.macro lumConvertRange fromto
+function ff_lumRange\fromto\()Jpeg_neon, export=1
+// x0 int16_t *dst
+// w1 int width
+// w2 int coeff
+// x3 int64_t offset
+ dup v25.4s, w2
dup v26.4s, w3
1:
ld1 {v0.8h}, [x0]
@@ -36,11 +37,11 @@ function ff_\name, export=1
mla v16.4s, v20.4s, v25.4s
mla v18.4s, v22.4s, v25.4s
.ifc \fromto, To
- sqshrn v0.4h, v16.4s, #\shift
- sqshrn2 v0.8h, v18.4s, #\shift
+ sqshrn v0.4h, v16.4s, 14
+ sqshrn2 v0.8h, v18.4s, 14
.else
- shrn v0.4h, v16.4s, #\shift
- shrn2 v0.8h, v18.4s, #\shift
+ shrn v0.4h, v16.4s, 14
+ shrn2 v0.8h, v18.4s, 14
.endif
subs w1, w1, #8
st1 {v0.8h}, [x0], #16
@@ -49,13 +50,15 @@ function ff_\name, export=1
endfunc
.endm
-.macro chrConvertRange name, fromto, mult, offset, shift
-function ff_\name, export=1
- mov w3, #\mult
+.macro chrConvertRange fromto
+function ff_chrRange\fromto\()Jpeg_neon, export=1
+// x0 int16_t *dstU
+// x1 int16_t *dstV
+// w2 int width
+// w3 int coeff
+// x4 int64_t offset
dup v25.4s, w3
- movz w3, #(\offset & 0xffff)
- movk w3, #((\offset >> 16) & 0xffff), lsl #16
- dup v26.4s, w3
+ dup v26.4s, w4
1:
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x1]
@@ -72,15 +75,15 @@ function ff_\name, export=1
mla v18.4s, v22.4s, v25.4s
mla v19.4s, v23.4s, v25.4s
.ifc \fromto, To
- sqshrn v0.4h, v16.4s, #\shift
- sqshrn v1.4h, v17.4s, #\shift
- sqshrn2 v0.8h, v18.4s, #\shift
- sqshrn2 v1.8h, v19.4s, #\shift
+ sqshrn v0.4h, v16.4s, 14
+ sqshrn v1.4h, v17.4s, 14
+ sqshrn2 v0.8h, v18.4s, 14
+ sqshrn2 v1.8h, v19.4s, 14
.else
- shrn v0.4h, v16.4s, #\shift
- shrn v1.4h, v17.4s, #\shift
- shrn2 v0.8h, v18.4s, #\shift
- shrn2 v1.8h, v19.4s, #\shift
+ shrn v0.4h, v16.4s, 14
+ shrn v1.4h, v17.4s, 14
+ shrn2 v0.8h, v18.4s, 14
+ shrn2 v1.8h, v19.4s, 14
.endif
subs w2, w2, #8
st1 {v0.8h}, [x0], #16
@@ -90,7 +93,7 @@ function ff_\name, export=1
endfunc
.endm
-lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14
-chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12
-lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14
-chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11
+lumConvertRange To
+chrConvertRange To
+lumConvertRange From
+chrConvertRange From
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 94059cec51..98f07ecfe5 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -218,17 +218,17 @@ NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
-void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
-void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
+ int coeff, int64_t offset);
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+ int coeff, int64_t offset);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
+ int coeff, int64_t offset);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+ int coeff, int64_t offset);
av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
{
- /* This code is currently disabled because of changes in the base
- * implementation of these functions. This code should be enabled
- * again once those changes are ported to this architecture. */
-#if 0
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
@@ -242,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
}
}
}
-#endif
}
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
--
2.30.2
More information about the ffmpeg-devel mailing list