[FFmpeg-cvslog] lavc/aarch64: reformat add_res funcs
J. Dekker
git at videolan.org
Tue Aug 16 15:01:15 EEST 2022
ffmpeg | branch: master | J. Dekker <jdek at itanimul.li> | Thu Jun 23 20:04:06 2022 +0200| [aa9eabb7a5283fd90b3274ac4b6ba0d16e4aaaa2] | committer: J. Dekker
lavc/aarch64: reformat add_res funcs
Signed-off-by: J. Dekker <jdek at itanimul.li>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aa9eabb7a5283fd90b3274ac4b6ba0d16e4aaaa2
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 216 ++++++++++++++++-----------------
1 file changed, 108 insertions(+), 108 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..484eea8437 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -27,21 +27,21 @@
#include "libavutil/aarch64/asm.S"
const trans, align=4
- .short 64, 83, 64, 36
- .short 89, 75, 50, 18
- .short 90, 87, 80, 70
- .short 57, 43, 25, 9
- .short 90, 90, 88, 85
- .short 82, 78, 73, 67
- .short 61, 54, 46, 38
- .short 31, 22, 13, 4
+ .short 64, 83, 64, 36
+ .short 89, 75, 50, 18
+ .short 90, 87, 80, 70
+ .short 57, 43, 25, 9
+ .short 90, 90, 88, 85
+ .short 82, 78, 73, 67
+ .short 61, 54, 46, 38
+ .short 31, 22, 13, 4
endconst
.macro clip10 in1, in2, c1, c2
- smax \in1, \in1, \c1
- smax \in2, \in2, \c1
- smin \in1, \in1, \c2
- smin \in2, \in2, \c2
+ smax \in1, \in1, \c1
+ smax \in2, \in2, \c1
+ smin \in1, \in1, \c2
+ smin \in2, \in2, \c2
.endm
function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
ld1 {v2.s}[1], [x0], x2
ld1 {v2.s}[2], [x0], x2
ld1 {v2.s}[3], [x0], x2
- sub x0, x0, x2, lsl #2
- uxtl v6.8h, v2.8b
- uxtl2 v7.8h, v2.16b
- sqadd v0.8h, v0.8h, v6.8h
- sqadd v1.8h, v1.8h, v7.8h
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
+ sub x0, x0, x2, lsl #2
+ uxtl v6.8h, v2.8b
+ uxtl2 v7.8h, v2.16b
+ sqadd v0.8h, v0.8h, v6.8h
+ sqadd v1.8h, v1.8h, v7.8h
+ sqxtun v0.8b, v0.8h
+ sqxtun2 v0.16b, v1.8h
st1 {v0.s}[0], [x0], x2
st1 {v0.s}[1], [x0], x2
st1 {v0.s}[2], [x0], x2
@@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
ld1 {v2.d}[0], [x12], x2
ld1 {v2.d}[1], [x12], x2
ld1 {v3.d}[0], [x12], x2
- sqadd v0.8h, v0.8h, v2.8h
+ sqadd v0.8h, v0.8h, v2.8h
ld1 {v3.d}[1], [x12], x2
- movi v4.8h, #0
- sqadd v1.8h, v1.8h, v3.8h
- mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
- clip10 v0.8h, v1.8h, v4.8h, v5.8h
- st1 {v0.d}[0], [x0], x2
- st1 {v0.d}[1], [x0], x2
- st1 {v1.d}[0], [x0], x2
- st1 {v1.d}[1], [x0], x2
+ movi v4.8h, #0
+ sqadd v1.8h, v1.8h, v3.8h
+ mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
+ clip10 v0.8h, v1.8h, v4.8h, v5.8h
+ st1 {v0.d}[0], [x0], x2
+ st1 {v0.d}[1], [x0], x2
+ st1 {v1.d}[0], [x0], x2
+ st1 {v1.d}[1], [x0], x2
ret
endfunc
function ff_hevc_add_residual_8x8_8_neon, export=1
- add x12, x0, x2
- add x2, x2, x2
- mov x3, #8
-1: subs x3, x3, #2
- ld1 {v2.d}[0], [x0]
- ld1 {v2.d}[1], [x12]
- uxtl v3.8h, v2.8b
+ add x12, x0, x2
+ add x2, x2, x2
+ mov x3, #8
+1: subs x3, x3, #2
+ ld1 {v2.d}[0], [x0]
+ ld1 {v2.d}[1], [x12]
+ uxtl v3.8h, v2.8b
ld1 {v0.8h-v1.8h}, [x1], #32
- uxtl2 v2.8h, v2.16b
- sqadd v0.8h, v0.8h, v3.8h
- sqadd v1.8h, v1.8h, v2.8h
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- st1 {v0.d}[0], [x0], x2
- st1 {v0.d}[1], [x12], x2
- bne 1b
+ uxtl2 v2.8h, v2.16b
+ sqadd v0.8h, v0.8h, v3.8h
+ sqadd v1.8h, v1.8h, v2.8h
+ sqxtun v0.8b, v0.8h
+ sqxtun2 v0.16b, v1.8h
+ st1 {v0.d}[0], [x0], x2
+ st1 {v0.d}[1], [x12], x2
+ bne 1b
ret
endfunc
function ff_hevc_add_residual_8x8_10_neon, export=1
- add x12, x0, x2
- add x2, x2, x2
- mov x3, #8
- movi v4.8h, #0
- mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1: subs x3, x3, #2
+ add x12, x0, x2
+ add x2, x2, x2
+ mov x3, #8
+ movi v4.8h, #0
+ mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
+1: subs x3, x3, #2
ld1 {v0.8h-v1.8h}, [x1], #32
- ld1 {v2.8h}, [x0]
- sqadd v0.8h, v0.8h, v2.8h
- ld1 {v3.8h}, [x12]
- sqadd v1.8h, v1.8h, v3.8h
- clip10 v0.8h, v1.8h, v4.8h, v5.8h
- st1 {v0.8h}, [x0], x2
- st1 {v1.8h}, [x12], x2
- bne 1b
+ ld1 {v2.8h}, [x0]
+ sqadd v0.8h, v0.8h, v2.8h
+ ld1 {v3.8h}, [x12]
+ sqadd v1.8h, v1.8h, v3.8h
+ clip10 v0.8h, v1.8h, v4.8h, v5.8h
+ st1 {v0.8h}, [x0], x2
+ st1 {v1.8h}, [x12], x2
+ bne 1b
ret
endfunc
function ff_hevc_add_residual_16x16_8_neon, export=1
- mov x3, #16
+ mov x3, #16
add x12, x0, x2
- add x2, x2, x2
-1: subs x3, x3, #2
+ add x2, x2, x2
+1: subs x3, x3, #2
ld1 {v16.16b}, [x0]
ld1 {v0.8h-v3.8h}, [x1], #64
ld1 {v19.16b}, [x12]
@@ -134,47 +134,47 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
uxtl2 v18.8h, v16.16b
uxtl v20.8h, v19.8b
uxtl2 v21.8h, v19.16b
- sqadd v0.8h, v0.8h, v17.8h
- sqadd v1.8h, v1.8h, v18.8h
- sqadd v2.8h, v2.8h, v20.8h
- sqadd v3.8h, v3.8h, v21.8h
- sqxtun v0.8b, v0.8h
+ sqadd v0.8h, v0.8h, v17.8h
+ sqadd v1.8h, v1.8h, v18.8h
+ sqadd v2.8h, v2.8h, v20.8h
+ sqadd v3.8h, v3.8h, v21.8h
+ sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
+ sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
st1 {v0.16b}, [x0], x2
st1 {v1.16b}, [x12], x2
- bne 1b
+ bne 1b
ret
endfunc
function ff_hevc_add_residual_16x16_10_neon, export=1
- mov x3, #16
+ mov x3, #16
movi v20.8h, #0
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
add x12, x0, x2
- add x2, x2, x2
-1: subs x3, x3, #2
+ add x2, x2, x2
+1: subs x3, x3, #2
ld1 {v16.8h-v17.8h}, [x0]
- ld1 {v0.8h-v3.8h}, [x1], #64
- sqadd v0.8h, v0.8h, v16.8h
+ ld1 {v0.8h-v3.8h}, [x1], #64
+ sqadd v0.8h, v0.8h, v16.8h
ld1 {v18.8h-v19.8h}, [x12]
- sqadd v1.8h, v1.8h, v17.8h
- sqadd v2.8h, v2.8h, v18.8h
- sqadd v3.8h, v3.8h, v19.8h
- clip10 v0.8h, v1.8h, v20.8h, v21.8h
- clip10 v2.8h, v3.8h, v20.8h, v21.8h
- st1 {v0.8h-v1.8h}, [x0], x2
- st1 {v2.8h-v3.8h}, [x12], x2
- bne 1b
+ sqadd v1.8h, v1.8h, v17.8h
+ sqadd v2.8h, v2.8h, v18.8h
+ sqadd v3.8h, v3.8h, v19.8h
+ clip10 v0.8h, v1.8h, v20.8h, v21.8h
+ clip10 v2.8h, v3.8h, v20.8h, v21.8h
+ st1 {v0.8h-v1.8h}, [x0], x2
+ st1 {v2.8h-v3.8h}, [x12], x2
+ bne 1b
ret
endfunc
function ff_hevc_add_residual_32x32_8_neon, export=1
add x12, x0, x2
- add x2, x2, x2
- mov x3, #32
-1: subs x3, x3, #2
+ add x2, x2, x2
+ mov x3, #32
+1: subs x3, x3, #2
ld1 {v20.16b, v21.16b}, [x0]
uxtl v16.8h, v20.8b
uxtl2 v17.8h, v20.16b
@@ -187,43 +187,43 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
uxtl2 v21.8h, v22.16b
uxtl v22.8h, v23.8b
uxtl2 v23.8h, v23.16b
- sqadd v0.8h, v0.8h, v16.8h
- sqadd v1.8h, v1.8h, v17.8h
- sqadd v2.8h, v2.8h, v18.8h
- sqadd v3.8h, v3.8h, v19.8h
- sqadd v4.8h, v4.8h, v20.8h
- sqadd v5.8h, v5.8h, v21.8h
- sqadd v6.8h, v6.8h, v22.8h
- sqadd v7.8h, v7.8h, v23.8h
- sqxtun v0.8b, v0.8h
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v17.8h
+ sqadd v2.8h, v2.8h, v18.8h
+ sqadd v3.8h, v3.8h, v19.8h
+ sqadd v4.8h, v4.8h, v20.8h
+ sqadd v5.8h, v5.8h, v21.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sqxtun v0.8b, v0.8h
sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
+ sqxtun v1.8b, v2.8h
sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v4.8h
+ sqxtun v2.8b, v4.8h
sqxtun2 v2.16b, v5.8h
- st1 {v0.16b, v1.16b}, [x0], x2
- sqxtun v3.8b, v6.8h
+ st1 {v0.16b, v1.16b}, [x0], x2
+ sqxtun v3.8b, v6.8h
sqxtun2 v3.16b, v7.8h
st1 {v2.16b, v3.16b}, [x12], x2
- bne 1b
+ bne 1b
ret
endfunc
function ff_hevc_add_residual_32x32_10_neon, export=1
- mov x3, #32
+ mov x3, #32
movi v20.8h, #0
mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
-1: subs x3, x3, #1
- ld1 {v0.8h-v3.8h}, [x1], #64
+1: subs x3, x3, #1
+ ld1 {v0.8h -v3.8h}, [x1], #64
ld1 {v16.8h-v19.8h}, [x0]
- sqadd v0.8h, v0.8h, v16.8h
- sqadd v1.8h, v1.8h, v17.8h
- sqadd v2.8h, v2.8h, v18.8h
- sqadd v3.8h, v3.8h, v19.8h
- clip10 v0.8h, v1.8h, v20.8h, v21.8h
- clip10 v2.8h, v3.8h, v20.8h, v21.8h
- st1 {v0.8h-v3.8h}, [x0], x2
- bne 1b
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v17.8h
+ sqadd v2.8h, v2.8h, v18.8h
+ sqadd v3.8h, v3.8h, v19.8h
+ clip10 v0.8h, v1.8h, v20.8h, v21.8h
+ clip10 v2.8h, v3.8h, v20.8h, v21.8h
+ st1 {v0.8h-v3.8h}, [x0], x2
+ bne 1b
ret
endfunc
More information about the ffmpeg-cvslog mailing list