[FFmpeg-cvslog] arm: vp9itxfm: Move the load_add_store macro out from the itxfm16 pass2 function
Martin Storsjö
git at videolan.org
Thu Sep 28 01:38:41 EEST 2017
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Sun Feb 5 22:55:20 2017 +0200| [47b3c2c18d1897f3c753ba0cec4b2d7aa24526af] | committer: Martin Storsjö
arm: vp9itxfm: Move the load_add_store macro out from the itxfm16 pass2 function
This allows reusing the macro for a separate implementation of the
pass2 function.
Signed-off-by: Martin Storsjö <martin at martin.st>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=47b3c2c18d1897f3c753ba0cec4b2d7aa24526af
---
libavcodec/arm/vp9itxfm_neon.S | 72 +++++++++++++++++++++---------------------
1 file changed, 36 insertions(+), 36 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index fd53a20a73..b3188bc711 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -657,6 +657,42 @@ function iadst16
bx lr
endfunc
+.macro load_add_store coef0, coef1, coef2, coef3 @ round four coef q-regs by 6 bits, add unsigned bytes read via r0/r3 (step r1), saturate and store back; clobbers d4-d7
+ vrshr.s16 \coef0, \coef0, #6 @ rounding right shift of every signed 16-bit lane by 6
+ vrshr.s16 \coef1, \coef1, #6
+
+ vld1.32 {d4[]}, [r0,:32], r1 @ 32 bits from [r0] replicated into both lanes of d4; r0 += r1
+ vld1.32 {d4[1]}, [r3,:32], r1 @ lane 1 replaced by 32 bits from [r3]; r3 += r1
+ vrshr.s16 \coef2, \coef2, #6 @ remaining shifts sit between the loads (presumably for scheduling)
+ vrshr.s16 \coef3, \coef3, #6
+ vld1.32 {d5[]}, [r0,:32], r1 @ second r0/r3 pair -> d5
+ vld1.32 {d5[1]}, [r3,:32], r1
+ vaddw.u8 \coef0, \coef0, d4 @ widen the 8 unsigned bytes of d4 and add them to the 16-bit lanes
+ vld1.32 {d6[]}, [r0,:32], r1 @ third r0/r3 pair -> d6
+ vld1.32 {d6[1]}, [r3,:32], r1
+ vaddw.u8 \coef1, \coef1, d5
+ vld1.32 {d7[]}, [r0,:32], r1 @ fourth r0/r3 pair -> d7
+ vld1.32 {d7[1]}, [r3,:32], r1
+
+ vqmovun.s16 d4, \coef0 @ narrow signed 16-bit sums to unsigned 8-bit with saturation
+ vqmovun.s16 d5, \coef1
+ sub r0, r0, r1, lsl #2 @ rewind r0 by 4*r1, undoing the four post-increments of the loads
+ sub r3, r3, r1, lsl #2 @ same rewind for r3
+ vaddw.u8 \coef2, \coef2, d6
+ vaddw.u8 \coef3, \coef3, d7
+ vst1.32 {d4[0]}, [r0,:32], r1 @ lane 0 goes back through r0, lane 1 through r3; each pointer += r1
+ vst1.32 {d4[1]}, [r3,:32], r1
+ vqmovun.s16 d6, \coef2 @ d6 input was already consumed by the vaddw above, so it can be reused as output
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r3,:32], r1
+ vqmovun.s16 d7, \coef3
+
+ vst1.32 {d6[0]}, [r0,:32], r1
+ vst1.32 {d6[1]}, [r3,:32], r1
+ vst1.32 {d7[0]}, [r0,:32], r1
+ vst1.32 {d7[1]}, [r3,:32], r1 @ on exit r0 and r3 have each advanced by 4*r1 in total
+.endm
+
.macro itxfm16_1d_funcs txfm
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@@ -739,44 +775,8 @@ function \txfm\()16_1d_4x16_pass2_neon
lsl r1, r1, #1
bl \txfm\()16
-.macro load_add_store coef0, coef1, coef2, coef3
- vrshr.s16 \coef0, \coef0, #6
- vrshr.s16 \coef1, \coef1, #6
-
- vld1.32 {d4[]}, [r0,:32], r1
- vld1.32 {d4[1]}, [r3,:32], r1
- vrshr.s16 \coef2, \coef2, #6
- vrshr.s16 \coef3, \coef3, #6
- vld1.32 {d5[]}, [r0,:32], r1
- vld1.32 {d5[1]}, [r3,:32], r1
- vaddw.u8 \coef0, \coef0, d4
- vld1.32 {d6[]}, [r0,:32], r1
- vld1.32 {d6[1]}, [r3,:32], r1
- vaddw.u8 \coef1, \coef1, d5
- vld1.32 {d7[]}, [r0,:32], r1
- vld1.32 {d7[1]}, [r3,:32], r1
-
- vqmovun.s16 d4, \coef0
- vqmovun.s16 d5, \coef1
- sub r0, r0, r1, lsl #2
- sub r3, r3, r1, lsl #2
- vaddw.u8 \coef2, \coef2, d6
- vaddw.u8 \coef3, \coef3, d7
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r3,:32], r1
- vqmovun.s16 d6, \coef2
- vst1.32 {d5[0]}, [r0,:32], r1
- vst1.32 {d5[1]}, [r3,:32], r1
- vqmovun.s16 d7, \coef3
-
- vst1.32 {d6[0]}, [r0,:32], r1
- vst1.32 {d6[1]}, [r3,:32], r1
- vst1.32 {d7[0]}, [r0,:32], r1
- vst1.32 {d7[1]}, [r3,:32], r1
-.endm
load_add_store q8, q9, q10, q11
load_add_store q12, q13, q14, q15
-.purgem load_add_store
pop {pc}
endfunc
More information about the ffmpeg-cvslog mailing list