[FFmpeg-devel] [PATCH 6/6] avcodec/hevcdsp: Add NEON optimization for idct16x16
Shengbin Meng
shengbinmeng at gmail.com
Wed Nov 22 13:12:06 EET 2017
From: Meng Wang <wangmeng.kids at bytedance.com>
Signed-off-by: Meng Wang <wangmeng.kids at bytedance.com>
---
libavcodec/arm/hevcdsp_idct_neon.S | 241 +++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_neon.c | 2 +
2 files changed, 243 insertions(+)
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index e39d00634b..272abf279c 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -451,6 +451,247 @@ function ff_hevc_transform_8x8_neon_8, export=1
bx lr
endfunc
+/* 16x16 even-line butterfly: dst[k] = e_8[k] + o_8[k], dst[7-k] = e_8[k] - o_8[k]; input q3-q10 (pairing per line below), output q8-q15 */
+.macro tr8_combine
+ vsub.s32 q12, q3, q10 // e_8[3] - o_8[3], dst[4]
+ vadd.s32 q11, q3, q10 // e_8[3] + o_8[3], dst[3]
+
+ vsub.s32 q13, q6, q9 // e_8[2] - o_8[2], dst[5]
+ vadd.s32 q10, q6, q9 // e_8[2] + o_8[2], dst[2]
+
+ vsub.s32 q14, q5, q8 // e_8[1] - o_8[1], dst[6]
+ vadd.s32 q9, q5, q8 // e_8[1] + o_8[1], dst[1]
+
+ vsub.s32 q15, q4, q7 // e_8[0] - o_8[0], dst[7]
+ vadd.s32 q8, q4, q7 // e_8[0] + o_8[0], dst[0]
+.endm
+
+.macro tr16_begin in0, in1, in2, in3, in4, in5, in6, in7 // odd half of the 16-pt IDCT: q2-q9 accumulate the 8 odd-index sources times the odd cosine coeffs held in d2/d3 (loaded from tr4f)
+ vmull.s16 q2, \in0, d2[1] // 90 * src1
+ vmull.s16 q3, \in0, d2[0] // 87 * src1
+ vmull.s16 q4, \in0, d2[3] // 80 * src1
+ vmull.s16 q5, \in0, d2[2] // 70 * src1
+ vmull.s16 q6, \in0, d3[1] // 57 * src1
+ vmull.s16 q7, \in0, d3[0] // 43 * src1
+ vmull.s16 q8, \in0, d3[3] // 25 * src1
+ vmull.s16 q9, \in0, d3[2] // 9 * src1
+
+ vmlal.s16 q2, \in1, d2[0] // 87 * src3
+ vmlal.s16 q3, \in1, d3[1] // 57 * src3
+ vmlal.s16 q4, \in1, d3[2] // 9 * src3
+ vmlsl.s16 q5, \in1, d3[0] //-43 * src3
+ vmlsl.s16 q6, \in1, d2[3] //-80 * src3
+ vmlsl.s16 q7, \in1, d2[1] //-90 * src3
+ vmlsl.s16 q8, \in1, d2[2] //-70 * src3
+ vmlsl.s16 q9, \in1, d3[3] //-25 * src3
+
+ vmlal.s16 q2, \in2, d2[3] // 80 * src5
+ vmlal.s16 q3, \in2, d3[2] // 9 * src5
+ vmlsl.s16 q4, \in2, d2[2] //-70 * src5
+ vmlsl.s16 q5, \in2, d2[0] //-87 * src5
+ vmlsl.s16 q6, \in2, d3[3] //-25 * src5
+ vmlal.s16 q7, \in2, d3[1] // 57 * src5
+ vmlal.s16 q8, \in2, d2[1] // 90 * src5
+ vmlal.s16 q9, \in2, d3[0] // 43 * src5
+
+ vmlal.s16 q2, \in3, d2[2] // 70 * src7
+ vmlsl.s16 q3, \in3, d3[0] //-43 * src7
+ vmlsl.s16 q4, \in3, d2[0] //-87 * src7
+ vmlal.s16 q5, \in3, d3[2] // 9 * src7
+ vmlal.s16 q6, \in3, d2[1] // 90 * src7
+ vmlal.s16 q7, \in3, d3[3] // 25 * src7
+ vmlsl.s16 q8, \in3, d2[3] //-80 * src7
+ vmlsl.s16 q9, \in3, d3[1] //-57 * src7
+
+ vmlal.s16 q2, \in4, d3[1] // 57 * src9
+ vmlsl.s16 q3, \in4, d2[3] //-80 * src9
+ vmlsl.s16 q4, \in4, d3[3] //-25 * src9
+ vmlal.s16 q5, \in4, d2[1] // 90 * src9
+ vmlsl.s16 q6, \in4, d3[2] // -9 * src9
+ vmlsl.s16 q7, \in4, d2[0] //-87 * src9
+ vmlal.s16 q8, \in4, d3[0] // 43 * src9
+ vmlal.s16 q9, \in4, d2[2] // 70 * src9
+
+ vmlal.s16 q2, \in5, d3[0] // 43 * src11
+ vmlsl.s16 q3, \in5, d2[1] //-90 * src11
+ vmlal.s16 q4, \in5, d3[1] // 57 * src11
+ vmlal.s16 q5, \in5, d3[3] // 25 * src11
+ vmlsl.s16 q6, \in5, d2[0] //-87 * src11
+ vmlal.s16 q7, \in5, d2[2] // 70 * src11
+ vmlal.s16 q8, \in5, d3[2] // 9 * src11
+ vmlsl.s16 q9, \in5, d2[3] //-80 * src11
+
+ vmlal.s16 q2, \in6, d3[3] // 25 * src13
+ vmlsl.s16 q3, \in6, d2[2] //-70 * src13
+ vmlal.s16 q4, \in6, d2[1] // 90 * src13
+ vmlsl.s16 q5, \in6, d2[3] //-80 * src13
+ vmlal.s16 q6, \in6, d3[0] // 43 * src13
+ vmlal.s16 q7, \in6, d3[2] // 9 * src13
+ vmlsl.s16 q8, \in6, d3[1] //-57 * src13
+ vmlal.s16 q9, \in6, d2[0] // 87 * src13
+
+
+ vmlal.s16 q2, \in7, d3[2] // 9 * src15
+ vmlsl.s16 q3, \in7, d3[3] //-25 * src15
+ vmlal.s16 q4, \in7, d3[0] // 43 * src15
+ vmlsl.s16 q5, \in7, d3[1] //-57 * src15
+ vmlal.s16 q6, \in7, d2[2] // 70 * src15
+ vmlsl.s16 q7, \in7, d2[3] //-80 * src15
+ vmlal.s16 q8, \in7, d2[0] // 87 * src15
+ vmlsl.s16 q9, \in7, d2[1] //-90 * src15
+.endm
+
+.macro tr16_end shift // final butterfly: pops the odd terms the caller saved with vpush {q2-q9}, forms dst[k]=even+odd and dst[15-k]=even-odd, and narrows with a rounding right shift
+ vpop {q2-q3} // odd terms for row pairs 0/15 and 1/14
+ vadd.s32 q4, q8, q2
+ vsub.s32 q5, q8, q2
+ vqrshrn.s32 d12, q4, \shift
+ vqrshrn.s32 d15, q5, \shift
+
+ vadd.s32 q4, q9, q3
+ vsub.s32 q5, q9, q3
+ vqrshrn.s32 d13, q4, \shift
+ vqrshrn.s32 d14, q5, \shift
+
+ vpop {q2-q3} // odd terms for row pairs 2/13 and 3/12
+ vadd.s32 q4, q10, q2
+ vsub.s32 q5, q10, q2
+ vqrshrn.s32 d16, q4, \shift
+ vqrshrn.s32 d19, q5, \shift
+
+ vadd.s32 q4, q11, q3
+ vsub.s32 q5, q11, q3
+ vqrshrn.s32 d17, q4, \shift
+ vqrshrn.s32 d18, q5, \shift
+
+ vpop {q2-q3} // odd terms for row pairs 4/11 and 5/10
+ vadd.s32 q4, q12, q2
+ vsub.s32 q5, q12, q2
+ vqrshrn.s32 d20, q4, \shift
+ vqrshrn.s32 d23, q5, \shift
+
+ vadd.s32 q4, q13, q3
+ vsub.s32 q5, q13, q3
+ vqrshrn.s32 d21, q4, \shift
+ vqrshrn.s32 d22, q5, \shift
+
+ vpop {q2-q3} // odd terms for row pairs 6/9 and 7/8
+ vadd.s32 q4, q14, q2
+ vsub.s32 q5, q14, q2
+ vqrshrn.s32 d24, q4, \shift
+ vqrshrn.s32 d27, q5, \shift
+
+ vadd.s32 q4, q15, q3
+ vsub.s32 q5, q15, q3
+ vqrshrn.s32 d25, q4, \shift
+ vqrshrn.s32 d26, q5, \shift
+.endm
+
+function ff_hevc_transform_16x16_neon_8, export=1 // void (int16_t *coeffs /*r0*/, int col_limit /*r1*/); two-pass 16x16 inverse transform, in place
+ push {r4-r8} // save callee-saved GPRs (r5-r8 used as locals below)
+ vpush {d8-d15} // save callee-saved NEON regs per AAPCS
+ mov r5, #64 // load stride: 2 rows of 16 int16 (loads skip every other row)
+ mov r6, #32 // store stride: 1 row of 16 int16
+ mov r7, #0 // r7 = columns processed so far
+ adr r3, tr4f
+ vld1.16 {d0, d1, d2, d3}, [r3] // d0-d3 = transform coefficient table
+ mov r8, r0 // remember coeff base for the horizontal pass
+0:
+ add r7, #4 // this iteration transforms 4 columns
+ add r0, #32 // point at row 1 of the current columns
+ // odd rows (1,3,...,15); stride r5 skips the even rows
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #544 // back to row 0 of these columns (32 + 8*64)
+
+ tr16_begin d24, d25, d26, d27, d28, d29, d30, d31
+ vpush {q2-q9} // save odd-half results; tr16_end pops them
+
+ // even rows (0,2,...,14)
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #512 // back to row 0 again (8*64)
+
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_combine
+
+ // combine even/odd halves; pass-1 rounding shift is 7
+ tr16_end #7
+
+ // store the 16 output rows of these 4 columns (row stride r6 = 32)
+ vst1.16 {d12}, [r0], r6
+ vst1.16 {d13}, [r0], r6
+ vst1.16 {d16}, [r0], r6
+ vst1.16 {d17}, [r0], r6
+ vst1.16 {d20}, [r0], r6
+ vst1.16 {d21}, [r0], r6
+ vst1.16 {d24}, [r0], r6
+ vst1.16 {d25}, [r0], r6
+ vst1.16 {d26}, [r0], r6
+ vst1.16 {d27}, [r0], r6
+ vst1.16 {d22}, [r0], r6
+ vst1.16 {d23}, [r0], r6
+ vst1.16 {d18}, [r0], r6
+ vst1.16 {d19}, [r0], r6
+ vst1.16 {d14}, [r0], r6
+ vst1.16 {d15}, [r0], r6
+ sub r0, #504 // 512 - 8: net advance of 4 int16 to the next 4 columns
+
+ cmp r1, r7 // r1 = col_limit
+ blt 1f // presumably columns past col_limit are all zero; skip them — verify against caller
+
+ cmp r7, #16
+ blt 0b // loop until all 16 columns are done
+
+1: mov r0, r8 // pass 2: horizontal transform, 4 rows per iteration
+ mov r7, #4 // 4 groups of 4 rows
+2: subs r7, #1
+ // one 4x16 strip of coeffs (4 rows); transpose so rows become vectors
+ vldm r0, {q8-q15} // coeffs
+ transpose_16b_4x4 d16, d20, d24, d28
+ transpose_16b_4x4 d17, d21, d25, d29
+ transpose_16b_4x4 d18, d22, d26, d30
+ transpose_16b_4x4 d19, d23, d27, d31
+ vpush {q12-q13} // 16x16 even line (8x8 odd line)
+ vpush {q8-q9} // 16x16 even line (8x8 even line)
+ tr16_begin d20, d28, d21, d29, d22, d30, d23, d31 // odd line transform 2n+1
+ vpop {q12-q15} // pop even line
+ vpush {q2-q9} // push results of 16x16 odd line
+ tr8_begin d28, d29, d30, d31 // even line transform 2n
+ tr4 d24, d25, d26, d27
+ tr8_combine
+ tr16_end #12 // pass-2 rounding shift for 8-bit depth
+ transpose_16b_4x4 d12, d13, d16, d17
+ transpose_16b_4x4 d20, d21, d24, d25
+ transpose_16b_4x4 d26, d27, d22, d23
+ transpose_16b_4x4 d18, d19, d14, d15
+ vswp d13, d20 // reorder transposed halves back into row order
+ vswp d14, d23
+ vswp d17, d24
+ vswp d18, d27
+ vswp q8, q10
+ vswp q7, q13
+ vstm r0!, {q6-q13} // store 4 rows, advance to next strip
+ bne 2b // flags from "subs r7, #1" survive; NEON data ops do not touch APSR
+
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
.align 4
tr4f:
.word 0x00240053 // 36 and d1[0] = 83
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 33cc44ef40..d846d01081 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -36,6 +36,7 @@ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_t
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_16x16_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
@@ -550,6 +551,7 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_neon_wrapper;
c->idct[0] = ff_hevc_transform_4x4_neon_8;
c->idct[1] = ff_hevc_transform_8x8_neon_8;
+ c->idct[2] = ff_hevc_transform_16x16_neon_8;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
--
2.13.6 (Apple Git-96)
More information about the ffmpeg-devel
mailing list