[FFmpeg-devel] [PATCH 4/6] avcodec/hevcdsp: Use pre-load (pld) to optimize data loading
Shengbin Meng
shengbinmeng at gmail.com
Wed Nov 22 13:12:04 EET 2017
From: Meng Wang <wangmeng.kids at bytedance.com>
Signed-off-by: Meng Wang <wangmeng.kids at bytedance.com>
---
libavcodec/arm/hevcdsp_epel_neon.S | 10 ++++++++++
libavcodec/arm/hevcdsp_qpel_neon.S | 24 ++++++++++++++++++++----
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
index d0d93e8033..03e6504481 100644
--- a/libavcodec/arm/hevcdsp_epel_neon.S
+++ b/libavcodec/arm/hevcdsp_epel_neon.S
@@ -306,6 +306,7 @@
cmp r5, #2
beq 2f
8: subs r4, #1
+ pld [r2]
\filter
vst1.16 {q7}, [r0], r1
regshuffle_d4
@@ -320,6 +321,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vst1.16 d14, [r0], r1
regshuffle_d4
@@ -357,6 +359,7 @@
cmp r5, #2
beq 2f
8: subs r4, #1
+ pld [r2]
\filter
vqrshrun.s16 d0, q7, #6
vst1.8 d0, [r0], r1
@@ -372,6 +375,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vqrshrun.s16 d0, q7, #6
vst1.32 d0[0], [r0], r1
@@ -396,6 +400,7 @@
cmp r5, #2
beq 2f
8: subs r4, #1
+ pld [r2]
\filter
vld1.16 {q0}, [r8], r9
vqadd.s16 q0, q7
@@ -415,6 +420,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vld1.16 d0, [r8], r9
vqadd.s16 d0, d14
@@ -465,6 +471,7 @@
cmp r5, #2
beq 2f
8: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmovl.s16 q13, d15
@@ -490,6 +497,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmul.s32 q14, q12, q6
@@ -535,6 +543,7 @@
cmp r5, #2
beq 2f
8: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmovl.s16 q13, d15
@@ -569,6 +578,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14
vmul.s32 q14, q12, q6
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
index 71ecc00b6e..b507fbc13b 100644
--- a/libavcodec/arm/hevcdsp_qpel_neon.S
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -231,6 +231,7 @@
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
\filter
vst1.16 {q7}, [r0], r1
regshuffle_d8
@@ -245,6 +246,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vst1.16 d14, [r0], r1
regshuffle_d8
@@ -273,6 +275,7 @@
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
\filter
vqrshrun.s16 d0, q7, #6
vst1.8 d0, [r0], r1
@@ -288,6 +291,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vqrshrun.s16 d0, q7, #6
vst1.32 d0[0], [r0], r1
@@ -301,6 +305,7 @@
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
\filter
vld1.16 {q0}, [r8], r9
vqadd.s16 q0, q7
@@ -320,6 +325,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vld1.16 d0, [r8], r9
vqadd.s16 d0, d14
@@ -358,6 +364,7 @@
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmovl.s16 q13, d15
@@ -383,6 +390,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmul.s32 q14, q12, q6
@@ -412,6 +420,7 @@
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmovl.s16 q13, d15
@@ -446,6 +455,7 @@
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
\filter
vmovl.s16 q12, d14
vmul.s32 q14, q12, q6
@@ -1524,8 +1534,9 @@ function ff_hevc_put_qpel_bi_uw_pixels_neon_8, export=1
cmp r5, #4
beq 4f
8: subs r4, #1
- vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
+ pld [r2]
vld1.16 {q0}, [r8], r9 // load 8x16bit src2
+ vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
vqadd.s16 q0, q7 // ((src << 6) + src2) on 8x16bit operation
vqrshrun.s16 d0, q0, #7 // (((src << 6) + src2) + offset) >> 7 narrow to 8x8bit
vst1.8 d0, [r0], r1
@@ -1535,15 +1546,16 @@ function ff_hevc_put_qpel_bi_uw_pixels_neon_8, export=1
beq 99f
mov r4, r12
add r6, #8
- mov r0, r6
add r10, #16
- mov r8, r10
add r7, #8
+ mov r0, r6
+ mov r8, r10
mov r2, r7
b 0b
4: subs r4, #1
- vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
+ pld [r2]
vld1.16 d0, [r8], r9
+ vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
vqadd.s16 d0, d14
vqrshrun.s16 d0, q0, #7
vst1.32 d0[0], [r0], r1
@@ -1578,6 +1590,7 @@ function ff_hevc_put_qpel_wt_pixels_neon_8, export=1
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
vshll.u8 q7 , d16, #6 // src[x] << 6 and move long to 8x16bit
vmovl.u16 q12, d14 // extending unsigned 4x16bit data to 4x32 bit
vmovl.u16 q13, d15
@@ -1602,6 +1615,7 @@ function ff_hevc_put_qpel_wt_pixels_neon_8, export=1
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
vshll.u8 q7 , d16, #6 // src[x] << 6 and move long to 8x16bit
vmovl.u16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmul.u32 q14, q12, q6
@@ -1632,6 +1646,7 @@ function ff_hevc_put_qpel_wt_pixels_neon_8, export=1
cmp r5, #4
beq 4f
8: subs r4, #1
+ pld [r2]
vshll.u8 q7, d16, #6 // src[x] << 6 and move long to 8x16bit
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmovl.s16 q13, d15
@@ -1665,6 +1680,7 @@ function ff_hevc_put_qpel_wt_pixels_neon_8, export=1
mov r2, r7
b 0b
4: subs r4, #1
+ pld [r2]
vshll.u8 q7, d16, #6 // src[x] << 6 and move long to 8x16bit
vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
vmul.s32 q14, q12, q6 // src * w1
--
2.13.6 (Apple Git-96)
More information about the ffmpeg-devel mailing list