[FFmpeg-cvslog] aarch64: vp9itxfm: Restructure the idct32 store macros

Martin Storsjö git at videolan.org
Wed Sep 27 01:17:27 EEST 2017


ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Thu Dec  1 11:10:19 2016 +0200| [58d87e0f49bcbbc6f426328f53b657bae7430cd2] | committer: Martin Storsjö

aarch64: vp9itxfm: Restructure the idct32 store macros

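This avoids the \() concatenation used to build register names from
bare numbers inside the macro bodies; that construct can't be used if
the whole macro is wrapped within another macro. The callers now pass
complete register operands (e.g. v16.8h) instead.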

This is also arguably more readable.

Signed-off-by: Martin Storsjö <martin at martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=58d87e0f49bcbbc6f426328f53b657bae7430cd2
---

 libavcodec/aarch64/vp9itxfm_neon.S | 80 +++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 7ce6df0a6d..c14c5f9ded 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -935,23 +935,23 @@ function idct32_1d_8x32_pass1_neon
 .macro store_rev a, b
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, v\b\().8h
-        st1             {v\a\().8h},  [x0], #16
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        st1             {\a},  [x0], #16
+        rev64           v0.8h, \a
         ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v\b\().8h},  [x0], #16
+        st1             {\b},  [x0], #16
         ext             v0.16b, v0.16b, v0.16b, #8
         st1             {v1.8h},  [x0], #16
         st1             {v0.8h},  [x0], #16
 .endm
-        store_rev       16, 24
-        store_rev       17, 25
-        store_rev       18, 26
-        store_rev       19, 27
-        store_rev       20, 28
-        store_rev       21, 29
-        store_rev       22, 30
-        store_rev       23, 31
+        store_rev       v16.8h, v24.8h
+        store_rev       v17.8h, v25.8h
+        store_rev       v18.8h, v26.8h
+        store_rev       v19.8h, v27.8h
+        store_rev       v20.8h, v28.8h
+        store_rev       v21.8h, v29.8h
+        store_rev       v22.8h, v30.8h
+        store_rev       v23.8h, v31.8h
         sub             x0,  x0,  #512
 .purgem store_rev
 
@@ -977,14 +977,14 @@ function idct32_1d_8x32_pass1_neon
         // subtracted from the output.
 .macro store_rev a, b
         ld1             {v4.8h},  [x0]
-        rev64           v1.8h, v\b\().8h
-        add             v4.8h, v4.8h, v\a\().8h
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        add             v4.8h, v4.8h, \a
+        rev64           v0.8h, \a
         st1             {v4.8h},  [x0], #16
         ext             v1.16b, v1.16b, v1.16b, #8
         ld1             {v5.8h},  [x0]
         ext             v0.16b, v0.16b, v0.16b, #8
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         st1             {v5.8h},  [x0], #16
         ld1             {v6.8h},  [x0]
         sub             v6.8h, v6.8h, v1.8h
@@ -994,14 +994,14 @@ function idct32_1d_8x32_pass1_neon
         st1             {v7.8h},  [x0], #16
 .endm
 
-        store_rev       31, 23
-        store_rev       30, 22
-        store_rev       29, 21
-        store_rev       28, 20
-        store_rev       27, 19
-        store_rev       26, 18
-        store_rev       25, 17
-        store_rev       24, 16
+        store_rev       v31.8h, v23.8h
+        store_rev       v30.8h, v22.8h
+        store_rev       v29.8h, v21.8h
+        store_rev       v28.8h, v20.8h
+        store_rev       v27.8h, v19.8h
+        store_rev       v26.8h, v18.8h
+        store_rev       v25.8h, v17.8h
+        store_rev       v24.8h, v16.8h
 .purgem store_rev
         ret
 endfunc
@@ -1047,21 +1047,21 @@ function idct32_1d_8x32_pass2_neon
 .if \neg == 0
         ld1             {v4.8h},  [x2], x9
         ld1             {v5.8h},  [x2], x9
-        add             v4.8h, v4.8h, v\a\().8h
+        add             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x9
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x9
-        add             v6.8h, v6.8h, v\c\().8h
-        add             v7.8h, v7.8h, v\d\().8h
+        add             v6.8h, v6.8h, \c
+        add             v7.8h, v7.8h, \d
 .else
         ld1             {v4.8h},  [x2], x7
         ld1             {v5.8h},  [x2], x7
-        sub             v4.8h, v4.8h, v\a\().8h
+        sub             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x7
-        sub             v5.8h, v5.8h, v\b\().8h
+        sub             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x7
-        sub             v6.8h, v6.8h, v\c\().8h
-        sub             v7.8h, v7.8h, v\d\().8h
+        sub             v6.8h, v6.8h, \c
+        sub             v7.8h, v7.8h, \d
 .endif
         ld1             {v0.8b}, [x0], x1
         ld1             {v1.8b}, [x0], x1
@@ -1085,15 +1085,15 @@ function idct32_1d_8x32_pass2_neon
         st1             {v6.8b}, [x0], x1
         st1             {v7.8b}, [x0], x1
 .endm
-        load_acc_store  31, 30, 29, 28
-        load_acc_store  27, 26, 25, 24
-        load_acc_store  23, 22, 21, 20
-        load_acc_store  19, 18, 17, 16
+        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
+        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
+        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
+        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
         sub             x2,  x2,  x9
-        load_acc_store  16, 17, 18, 19, 1
-        load_acc_store  20, 21, 22, 23, 1
-        load_acc_store  24, 25, 26, 27, 1
-        load_acc_store  28, 29, 30, 31, 1
+        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
+        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
+        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
+        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
 .purgem load_acc_store
         ret
 endfunc



More information about the ffmpeg-cvslog mailing list