[FFmpeg-cvslog] aarch64: vp9itxfm: Restructure the idct32 store macros
Martin Storsjö
git at videolan.org
Wed Sep 27 01:17:27 EEST 2017
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Thu Dec 1 11:10:19 2016 +0200| [58d87e0f49bcbbc6f426328f53b657bae7430cd2] | committer: Martin Storsjö
aarch64: vp9itxfm: Restructure the idct32 store macros
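This avoids building register names by concatenation inside the macros
(e.g. `v\a\().8h`), which can't be used if the whole macro is wrapped
within another macro. Callers now pass complete operands such as `v16.8h`.
This is also arguably more readable.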
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=58d87e0f49bcbbc6f426328f53b657bae7430cd2
---
libavcodec/aarch64/vp9itxfm_neon.S | 80 +++++++++++++++++++-------------------
1 file changed, 40 insertions(+), 40 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 7ce6df0a6d..c14c5f9ded 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -935,23 +935,23 @@ function idct32_1d_8x32_pass1_neon
.macro store_rev a, b
// There's no rev128 instruction, but we reverse each 64 bit
// half, and then flip them using an ext with 8 bytes offset.
- rev64 v1.8h, v\b\().8h
- st1 {v\a\().8h}, [x0], #16
- rev64 v0.8h, v\a\().8h
+ rev64 v1.8h, \b
+ st1 {\a}, [x0], #16
+ rev64 v0.8h, \a
ext v1.16b, v1.16b, v1.16b, #8
- st1 {v\b\().8h}, [x0], #16
+ st1 {\b}, [x0], #16
ext v0.16b, v0.16b, v0.16b, #8
st1 {v1.8h}, [x0], #16
st1 {v0.8h}, [x0], #16
.endm
- store_rev 16, 24
- store_rev 17, 25
- store_rev 18, 26
- store_rev 19, 27
- store_rev 20, 28
- store_rev 21, 29
- store_rev 22, 30
- store_rev 23, 31
+ store_rev v16.8h, v24.8h
+ store_rev v17.8h, v25.8h
+ store_rev v18.8h, v26.8h
+ store_rev v19.8h, v27.8h
+ store_rev v20.8h, v28.8h
+ store_rev v21.8h, v29.8h
+ store_rev v22.8h, v30.8h
+ store_rev v23.8h, v31.8h
sub x0, x0, #512
.purgem store_rev
@@ -977,14 +977,14 @@ function idct32_1d_8x32_pass1_neon
// subtracted from the output.
.macro store_rev a, b
ld1 {v4.8h}, [x0]
- rev64 v1.8h, v\b\().8h
- add v4.8h, v4.8h, v\a\().8h
- rev64 v0.8h, v\a\().8h
+ rev64 v1.8h, \b
+ add v4.8h, v4.8h, \a
+ rev64 v0.8h, \a
st1 {v4.8h}, [x0], #16
ext v1.16b, v1.16b, v1.16b, #8
ld1 {v5.8h}, [x0]
ext v0.16b, v0.16b, v0.16b, #8
- add v5.8h, v5.8h, v\b\().8h
+ add v5.8h, v5.8h, \b
st1 {v5.8h}, [x0], #16
ld1 {v6.8h}, [x0]
sub v6.8h, v6.8h, v1.8h
@@ -994,14 +994,14 @@ function idct32_1d_8x32_pass1_neon
st1 {v7.8h}, [x0], #16
.endm
- store_rev 31, 23
- store_rev 30, 22
- store_rev 29, 21
- store_rev 28, 20
- store_rev 27, 19
- store_rev 26, 18
- store_rev 25, 17
- store_rev 24, 16
+ store_rev v31.8h, v23.8h
+ store_rev v30.8h, v22.8h
+ store_rev v29.8h, v21.8h
+ store_rev v28.8h, v20.8h
+ store_rev v27.8h, v19.8h
+ store_rev v26.8h, v18.8h
+ store_rev v25.8h, v17.8h
+ store_rev v24.8h, v16.8h
.purgem store_rev
ret
endfunc
@@ -1047,21 +1047,21 @@ function idct32_1d_8x32_pass2_neon
.if \neg == 0
ld1 {v4.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9
- add v4.8h, v4.8h, v\a\().8h
+ add v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x9
- add v5.8h, v5.8h, v\b\().8h
+ add v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x9
- add v6.8h, v6.8h, v\c\().8h
- add v7.8h, v7.8h, v\d\().8h
+ add v6.8h, v6.8h, \c
+ add v7.8h, v7.8h, \d
.else
ld1 {v4.8h}, [x2], x7
ld1 {v5.8h}, [x2], x7
- sub v4.8h, v4.8h, v\a\().8h
+ sub v4.8h, v4.8h, \a
ld1 {v6.8h}, [x2], x7
- sub v5.8h, v5.8h, v\b\().8h
+ sub v5.8h, v5.8h, \b
ld1 {v7.8h}, [x2], x7
- sub v6.8h, v6.8h, v\c\().8h
- sub v7.8h, v7.8h, v\d\().8h
+ sub v6.8h, v6.8h, \c
+ sub v7.8h, v7.8h, \d
.endif
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
@@ -1085,15 +1085,15 @@ function idct32_1d_8x32_pass2_neon
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
.endm
- load_acc_store 31, 30, 29, 28
- load_acc_store 27, 26, 25, 24
- load_acc_store 23, 22, 21, 20
- load_acc_store 19, 18, 17, 16
+ load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
+ load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
+ load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
+ load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
sub x2, x2, x9
- load_acc_store 16, 17, 18, 19, 1
- load_acc_store 20, 21, 22, 23, 1
- load_acc_store 24, 25, 26, 27, 1
- load_acc_store 28, 29, 30, 31, 1
+ load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
+ load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
+ load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
+ load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
ret
endfunc
More information about the ffmpeg-cvslog mailing list