[FFmpeg-cvslog] aarch64: Make transpose_4x4H do a regular transpose
Martin Storsjö
git at videolan.org
Sun Apr 24 13:51:51 CEST 2016
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Fri Mar 25 23:44:10 2016 +0200| [cdb1665f70def544ddab3e3ed3763ef99c8b3873] | committer: Martin Storsjö
aarch64: Make transpose_4x4H do a regular transpose
Previously, ff_h264_idct_add_neon (originally in the arm version) used
a non-regular transpose in order to be able to use more instructions
that deal with registers as 128-bit register pairs. The aarch64
translation doesn't take advantage of this to the same extent, but
brought along the same structure since it was a straight translation.
This reshuffles ff_h264_idct_add_neon, bringing it closer to
the C implementation, making the transpose_4x4H macro do a regular
transpose, usable for other algorithms as well.
Previously, the third and fourth outputs from transpose_4x4H were
swapped, and prior to cc29d96d5a, the third and fourth inputs were
swapped as well. In addition to just swapping the outputs, also
renumber the intermediate registers for better readability (making
the register order match transpose_4x8B).
This runs with the same number of cycles as before.
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cdb1665f70def544ddab3e3ed3763ef99c8b3873
---
libavcodec/aarch64/h264idct_neon.S | 24 ++++++++++++------------
libavcodec/aarch64/neon.S | 12 ++++++------
2 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 78f780a..5395e14 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -33,25 +33,25 @@ function ff_h264_idct_add_neon, export=1
sshr v17.4H, v3.4H, #1
st1 {v30.8H}, [x1], #16
sub v5.4H, v0.4H, v2.4H
- add v6.4H, v1.4H, v17.4H
- sub v7.4H, v16.4H, v3.4H
- add v0.4H, v4.4H, v6.4H
- add v1.4H, v5.4H, v7.4H
- sub v3.4H, v4.4H, v6.4H
- sub v2.4H, v5.4H, v7.4H
+ sub v6.4H, v16.4H, v3.4H
+ add v7.4H, v1.4H, v17.4H
+ add v0.4H, v4.4H, v7.4H
+ add v1.4H, v5.4H, v6.4H
+ sub v2.4H, v5.4H, v6.4H
+ sub v3.4H, v4.4H, v7.4H
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
- add v4.4H, v0.4H, v3.4H
+ add v4.4H, v0.4H, v2.4H
ld1 {v18.S}[0], [x0], x2
- sshr v16.4H, v2.4H, #1
+ sshr v16.4H, v3.4H, #1
sshr v17.4H, v1.4H, #1
- ld1 {v19.S}[1], [x0], x2
- sub v5.4H, v0.4H, v3.4H
ld1 {v18.S}[1], [x0], x2
+ sub v5.4H, v0.4H, v2.4H
+ ld1 {v19.S}[1], [x0], x2
add v6.4H, v16.4H, v1.4H
ins v4.D[1], v5.D[0]
- sub v7.4H, v2.4H, v17.4H
+ sub v7.4H, v17.4H, v3.4H
ld1 {v19.S}[0], [x0], x2
ins v6.D[1], v7.D[0]
sub x0, x0, x2, lsl #2
@@ -68,8 +68,8 @@ function ff_h264_idct_add_neon, export=1
sqxtun v1.8B, v1.8H
st1 {v0.S}[0], [x0], x2
- st1 {v1.S}[1], [x0], x2
st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
sub x1, x1, #32
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 767bc9d..377009e 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -107,12 +107,12 @@
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
- trn1 \r7\().4H, \r2\().4H, \r3\().4H
- trn2 \r6\().4H, \r2\().4H, \r3\().4H
- trn1 \r0\().2S, \r4\().2S, \r7\().2S
- trn2 \r3\().2S, \r4\().2S, \r7\().2S
- trn1 \r1\().2S, \r5\().2S, \r6\().2S
- trn2 \r2\().2S, \r5\().2S, \r6\().2S
+ trn1 \r6\().4H, \r2\().4H, \r3\().4H
+ trn2 \r7\().4H, \r2\().4H, \r3\().4H
+ trn1 \r0\().2S, \r4\().2S, \r6\().2S
+ trn2 \r2\().2S, \r4\().2S, \r6\().2S
+ trn1 \r1\().2S, \r5\().2S, \r7\().2S
+ trn2 \r3\().2S, \r5\().2S, \r7\().2S
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
More information about the ffmpeg-cvslog mailing list